/*-------------------------------------------------------------------------
*
* radixtree.h
* Template for adaptive radix tree.
*
* A template to generate an "adaptive radix tree", specialized for value
* types and for local/shared memory.
*
* The concept originates from the paper "The Adaptive Radix Tree: ARTful
* Indexing for Main-Memory Databases" by Viktor Leis, Alfons Kemper,
* and Thomas Neumann, 2013.
*
* Radix trees have some advantages over hash tables:
* - The keys are logically ordered, allowing efficient sorted iteration
* and range queries
* - Operations using keys that are lexicographically close together
* will have favorable memory locality
* - Memory use grows gradually rather than by doubling
* - The key does not need to be stored with the value, since the key
* is implicitly contained in the path to the value
*
* Some disadvantages are:
* - Point queries (along with insertion and deletion) are slower than
* a linear probing hash table as in simplehash.h
* - Memory usage varies by key distribution, so is difficult to predict
*
* A classic radix tree consists of nodes, each containing an array of
* pointers to child nodes. The size of the array is determined by the
* "span" of the tree, which is the number of bits of the key used to
* index into the array. For example, with a span of 6, a "chunk"
* of 6 bits is extracted from the key at each node traversal, and
* the arrays thus have a "fanout" of 2^6 or 64 entries. A large span
* allows a shorter tree, but requires larger arrays that may be mostly
* wasted space.
*
* The key idea of the adaptive radix tree is to choose different
* data structures based on the number of child nodes. A node will
* start out small when it is first populated, and when it is full,
* it is replaced by the next larger size. Conversely, when a node
* becomes mostly empty, it is replaced by the next smaller node. The
* bulk of the code complexity in this module stems from this dynamic
* switching. One mitigating factor is using a span of 8, since bytes
* are directly addressable.
*
* The ART paper mentions three ways to implement leaves:
*
* "- Single-value leaves: The values are stored using an addi-
* tional leaf node type which stores one value.
* - Multi-value leaves: The values are stored in one of four
* different leaf node types, which mirror the structure of
* inner nodes, but contain values instead of pointers.
* - Combined pointer/value slots: If values fit into point-
* ers, no separate node types are necessary. Instead, each
* pointer storage location in an inner node can either
* store a pointer or a value."
*
* We use a form of "combined pointer/value slots", as recommended. Values
* whose size (if fixed at compile time) is equal to or smaller than the
* platform's pointer type are stored in the child slots of the last-level
* node, while larger values are stored as "single-value" leaves, as described
* above. This offers flexibility and efficiency. Variable-length types are currently
* treated as single-value leaves for simplicity, but future work may
* allow those to be stored in the child pointer arrays, when they're
* small enough.
*
* There are two other techniques described in the paper that are not
* implemented here:
* - path compression "...removes all inner nodes that have only a single child."
* - lazy path expansion "...inner nodes are only created if they are required
* to distinguish at least two leaf nodes."
*
* We do have a form of "poor man's path compression", however, enabled by
* only supporting unsigned integer keys (for now assumed to be 64-bit):
* A tree doesn't contain paths where the highest bytes of all keys are
* zero. That way, the tree's height adapts to the distribution of keys.
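* For example, if no key is larger than 0xFFFF, the tree has only two
* levels, even though keys are 64-bit integers.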
*
* To handle concurrency, we use a single reader-writer lock for the
* radix tree. If concurrent write operations are possible, the tree
* must be exclusively locked during write operations such as RT_SET()
* and RT_DELETE(), and share locked during read operations such as
* RT_FIND() and RT_BEGIN_ITERATE().
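*
* For example, a writer on a shared-memory tree would bracket its update
* roughly like this (a sketch; the "foo" prefix is hypothetical):
*
* foo_lock_exclusive(tree);
* foo_set(tree, key, &value);
* foo_unlock(tree);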
*
* TODO: The current locking mechanism is not optimized for high
* concurrency with mixed read-write workloads. In the future it might
* be worthwhile to replace it with the Optimistic Lock Coupling or
* ROWEX mentioned in the paper "The ART of Practical Synchronization"
* by the same authors as the ART paper, 2016.
*
* To generate a radix tree and associated functions for a use case,
* several macros have to be #define'ed before this file is included.
* Including the file #undef's all those, so a new radix tree can be
* generated afterwards.
*
* The relevant parameters are:
* - RT_PREFIX - prefix for all symbol names generated. A prefix of "foo"
* will result in radix tree type "foo_radix_tree" and functions like
* "foo_create"/"foo_free" and so forth.
* - RT_DECLARE - if defined function prototypes and type declarations are
* generated
* - RT_DEFINE - if defined function definitions are generated
* - RT_SCOPE - in which scope (e.g. extern, static inline) do function
* declarations reside
* - RT_VALUE_TYPE - the type of the value.
* - RT_VARLEN_VALUE_SIZE() - for variable length values, an expression
* involving a pointer to the value type, to calculate size.
* NOTE: implies that the value is in fact variable-length,
* so do not set for fixed-length values.
* - RT_RUNTIME_EMBEDDABLE_VALUE - for variable length values, allows
* storing the value in a child pointer slot, rather than as a single-
* value leaf, if small enough. This requires that the value, when
* read as a child pointer, can be tagged in the lowest bit.
*
* Optional parameters:
* - RT_SHMEM - if defined, the radix tree is created in the DSA area
* so that multiple processes can access it simultaneously.
* - RT_DEBUG - if defined add stats tracking and debugging functions
*
* Interface
* ---------
*
* RT_CREATE - Create a new, empty radix tree
* RT_FREE - Free the radix tree
* RT_FIND - Lookup the value for a given key
* RT_SET - Set a key-value pair
* RT_BEGIN_ITERATE - Begin iterating through all key-value pairs
* RT_ITERATE_NEXT - Return next key-value pair, if any
* RT_END_ITERATE - End iteration
* RT_MEMORY_USAGE - Get the memory usage, as measured by the space used in memory context blocks
*
* Interface for Shared Memory
* ---------
*
* RT_ATTACH - Attach to the radix tree
* RT_DETACH - Detach from the radix tree
* RT_LOCK_EXCLUSIVE - Lock the radix tree in exclusive mode
* RT_LOCK_SHARE - Lock the radix tree in share mode
* RT_UNLOCK - Unlock the radix tree
* RT_GET_HANDLE - Return the handle of the radix tree
*
* Optional Interface
* ---------
*
* RT_DELETE - Delete a key-value pair. Declared/defined if RT_USE_DELETE is defined
*
*
* Copyright (c) 2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/include/lib/radixtree.h
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "nodes/bitmapset.h"
#include "port/pg_bitutils.h"
#include "port/simd.h"
#include "utils/dsa.h"
#include "utils/memutils.h"
/* helpers */
#define RT_MAKE_PREFIX(a) CppConcat(a,_)
#define RT_MAKE_NAME(name) RT_MAKE_NAME_(RT_MAKE_PREFIX(RT_PREFIX),name)
#define RT_MAKE_NAME_(a,b) CppConcat(a,b)
/*
* stringify a macro constant, from https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
*/
#define RT_STR(s) RT_STR_(s)
#define RT_STR_(s) #s
/* function declarations */
#define RT_CREATE RT_MAKE_NAME(create)
#define RT_FREE RT_MAKE_NAME(free)
#define RT_FIND RT_MAKE_NAME(find)
#ifdef RT_SHMEM
#define RT_ATTACH RT_MAKE_NAME(attach)
#define RT_DETACH RT_MAKE_NAME(detach)
#define RT_GET_HANDLE RT_MAKE_NAME(get_handle)
#define RT_LOCK_EXCLUSIVE RT_MAKE_NAME(lock_exclusive)
#define RT_LOCK_SHARE RT_MAKE_NAME(lock_share)
#define RT_UNLOCK RT_MAKE_NAME(unlock)
#endif
#define RT_SET RT_MAKE_NAME(set)
#define RT_BEGIN_ITERATE RT_MAKE_NAME(begin_iterate)
#define RT_ITERATE_NEXT RT_MAKE_NAME(iterate_next)
#define RT_END_ITERATE RT_MAKE_NAME(end_iterate)
#ifdef RT_USE_DELETE
#define RT_DELETE RT_MAKE_NAME(delete)
#endif
#define RT_MEMORY_USAGE RT_MAKE_NAME(memory_usage)
#define RT_DUMP_NODE RT_MAKE_NAME(dump_node)
#define RT_STATS RT_MAKE_NAME(stats)
/* internal helper functions (no externally visible prototypes) */
#define RT_CHILDPTR_IS_VALUE RT_MAKE_NAME(childptr_is_value)
#define RT_VALUE_IS_EMBEDDABLE RT_MAKE_NAME(value_is_embeddable)
#define RT_GET_SLOT_RECURSIVE RT_MAKE_NAME(get_slot_recursive)
#define RT_DELETE_RECURSIVE RT_MAKE_NAME(delete_recursive)
#define RT_ALLOC_NODE RT_MAKE_NAME(alloc_node)
#define RT_ALLOC_LEAF RT_MAKE_NAME(alloc_leaf)
#define RT_FREE_NODE RT_MAKE_NAME(free_node)
#define RT_FREE_LEAF RT_MAKE_NAME(free_leaf)
#define RT_FREE_RECURSE RT_MAKE_NAME(free_recurse)
#define RT_EXTEND_UP RT_MAKE_NAME(extend_up)
#define RT_EXTEND_DOWN RT_MAKE_NAME(extend_down)
#define RT_COPY_COMMON RT_MAKE_NAME(copy_common)
#define RT_PTR_SET_LOCAL RT_MAKE_NAME(ptr_set_local)
#define RT_NODE_16_SEARCH_EQ RT_MAKE_NAME(node_16_search_eq)
#define RT_NODE_4_GET_INSERTPOS RT_MAKE_NAME(node_4_get_insertpos)
#define RT_NODE_16_GET_INSERTPOS RT_MAKE_NAME(node_16_get_insertpos)
#define RT_SHIFT_ARRAYS_FOR_INSERT RT_MAKE_NAME(shift_arrays_for_insert)
#define RT_SHIFT_ARRAYS_AND_DELETE RT_MAKE_NAME(shift_arrays_and_delete)
#define RT_COPY_ARRAYS_FOR_INSERT RT_MAKE_NAME(copy_arrays_for_insert)
#define RT_COPY_ARRAYS_AND_DELETE RT_MAKE_NAME(copy_arrays_and_delete)
#define RT_NODE_48_IS_CHUNK_USED RT_MAKE_NAME(node_48_is_chunk_used)
#define RT_NODE_48_GET_CHILD RT_MAKE_NAME(node_48_get_child)
#define RT_NODE_256_IS_CHUNK_USED RT_MAKE_NAME(node_256_is_chunk_used)
#define RT_NODE_256_GET_CHILD RT_MAKE_NAME(node_256_get_child)
#define RT_KEY_GET_SHIFT RT_MAKE_NAME(key_get_shift)
#define RT_SHIFT_GET_MAX_VAL RT_MAKE_NAME(shift_get_max_val)
#define RT_NODE_SEARCH RT_MAKE_NAME(node_search)
#define RT_NODE_DELETE RT_MAKE_NAME(node_delete)
#define RT_NODE_INSERT RT_MAKE_NAME(node_insert)
#define RT_ADD_CHILD_4 RT_MAKE_NAME(add_child_4)
#define RT_ADD_CHILD_16 RT_MAKE_NAME(add_child_16)
#define RT_ADD_CHILD_48 RT_MAKE_NAME(add_child_48)
#define RT_ADD_CHILD_256 RT_MAKE_NAME(add_child_256)
#define RT_GROW_NODE_4 RT_MAKE_NAME(grow_node_4)
#define RT_GROW_NODE_16 RT_MAKE_NAME(grow_node_16)
#define RT_GROW_NODE_48 RT_MAKE_NAME(grow_node_48)
#define RT_REMOVE_CHILD_4 RT_MAKE_NAME(remove_child_4)
#define RT_REMOVE_CHILD_16 RT_MAKE_NAME(remove_child_16)
#define RT_REMOVE_CHILD_48 RT_MAKE_NAME(remove_child_48)
#define RT_REMOVE_CHILD_256 RT_MAKE_NAME(remove_child_256)
#define RT_SHRINK_NODE_16 RT_MAKE_NAME(shrink_child_16)
#define RT_SHRINK_NODE_48 RT_MAKE_NAME(shrink_child_48)
#define RT_SHRINK_NODE_256 RT_MAKE_NAME(shrink_child_256)
#define RT_NODE_ITERATE_NEXT RT_MAKE_NAME(node_iterate_next)
#define RT_VERIFY_NODE RT_MAKE_NAME(verify_node)
/* type declarations */
#define RT_RADIX_TREE RT_MAKE_NAME(radix_tree)
#define RT_RADIX_TREE_CONTROL RT_MAKE_NAME(radix_tree_control)
#define RT_ITER RT_MAKE_NAME(iter)
#ifdef RT_SHMEM
#define RT_HANDLE RT_MAKE_NAME(handle)
#endif
#define RT_NODE RT_MAKE_NAME(node)
#define RT_CHILD_PTR RT_MAKE_NAME(child_ptr)
#define RT_NODE_ITER RT_MAKE_NAME(node_iter)
#define RT_NODE_4 RT_MAKE_NAME(node_4)
#define RT_NODE_16 RT_MAKE_NAME(node_16)
#define RT_NODE_48 RT_MAKE_NAME(node_48)
#define RT_NODE_256 RT_MAKE_NAME(node_256)
#define RT_SIZE_CLASS RT_MAKE_NAME(size_class)
#define RT_SIZE_CLASS_ELEM RT_MAKE_NAME(size_class_elem)
#define RT_SIZE_CLASS_INFO RT_MAKE_NAME(size_class_info)
#define RT_CLASS_4 RT_MAKE_NAME(class_4)
#define RT_CLASS_16_LO RT_MAKE_NAME(class_32_min)
#define RT_CLASS_16_HI RT_MAKE_NAME(class_32_max)
#define RT_CLASS_48 RT_MAKE_NAME(class_48)
#define RT_CLASS_256 RT_MAKE_NAME(class_256)
/* generate forward declarations necessary to use the radix tree */
#ifdef RT_DECLARE
typedef struct RT_RADIX_TREE RT_RADIX_TREE;
typedef struct RT_ITER RT_ITER;
#ifdef RT_SHMEM
typedef dsa_pointer RT_HANDLE;
#endif
#ifdef RT_SHMEM
RT_SCOPE RT_RADIX_TREE *RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id);
RT_SCOPE RT_RADIX_TREE *RT_ATTACH(dsa_area *dsa, dsa_pointer dp);
RT_SCOPE void RT_DETACH(RT_RADIX_TREE * tree);
RT_SCOPE RT_HANDLE RT_GET_HANDLE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_LOCK_EXCLUSIVE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_LOCK_SHARE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_UNLOCK(RT_RADIX_TREE * tree);
#else
RT_SCOPE RT_RADIX_TREE *RT_CREATE(MemoryContext ctx);
#endif
RT_SCOPE void RT_FREE(RT_RADIX_TREE * tree);
RT_SCOPE RT_VALUE_TYPE *RT_FIND(RT_RADIX_TREE * tree, uint64 key);
RT_SCOPE bool RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p);
#ifdef RT_USE_DELETE
RT_SCOPE bool RT_DELETE(RT_RADIX_TREE * tree, uint64 key);
#endif
RT_SCOPE RT_ITER *RT_BEGIN_ITERATE(RT_RADIX_TREE * tree);
RT_SCOPE RT_VALUE_TYPE *RT_ITERATE_NEXT(RT_ITER * iter, uint64 *key_p);
RT_SCOPE void RT_END_ITERATE(RT_ITER * iter);
RT_SCOPE uint64 RT_MEMORY_USAGE(RT_RADIX_TREE * tree);
#ifdef RT_DEBUG
RT_SCOPE void RT_STATS(RT_RADIX_TREE * tree);
#endif
#endif /* RT_DECLARE */
/* generate implementation of the radix tree */
#ifdef RT_DEFINE
/* The number of bits encoded in one tree level */
#define RT_SPAN BITS_PER_BYTE
/*
* The number of possible partial keys, and thus the maximum number of
* child pointers, for a node.
*/
#define RT_NODE_MAX_SLOTS (1 << RT_SPAN)
/* Mask for extracting a chunk from a key */
#define RT_CHUNK_MASK ((1 << RT_SPAN) - 1)
/* Maximum shift needed to extract a chunk from a key */
#define RT_MAX_SHIFT RT_KEY_GET_SHIFT(UINT64_MAX)
/* Maximum level a tree can reach for a key */
#define RT_MAX_LEVEL ((sizeof(uint64) * BITS_PER_BYTE) / RT_SPAN)
/* Get a chunk from the key */
#define RT_GET_KEY_CHUNK(key, shift) ((uint8) (((key) >> (shift)) & RT_CHUNK_MASK))
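/*
 * For example, with RT_SPAN = 8, RT_GET_KEY_CHUNK(0x0A0B0C, 8) yields 0x0B:
 * the key is shifted right by one byte and masked down to a single byte.
 */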
/* For accessing bitmaps */
#define RT_BM_IDX(x) ((x) / BITS_PER_BITMAPWORD)
#define RT_BM_BIT(x) ((x) % BITS_PER_BITMAPWORD)
/*
* Node kinds
*
* The different node kinds are what make the tree "adaptive".
*
* Each node kind is associated with a different datatype and different
* search/set/delete/iterate algorithms adapted for its size. The largest
* kind, node256 is basically the same as a traditional radix tree,
* and would be most wasteful of memory when sparsely populated. The
* smaller nodes expend some additional CPU time to enable a smaller
* memory footprint.
*
* NOTE: There are 4 node kinds, and this should never be increased,
* for several reasons:
* 1. With 5 or more kinds, gcc tends to use a jump table for switch
* statements.
* 2. The 4 kinds can be represented with 2 bits, so we have the option
* in the future to tag the node pointer with the kind, even on
* platforms with 32-bit pointers. That would touch fewer cache lines
* during traversal and allow faster recovery from branch mispredicts.
* 3. We can have multiple size classes per node kind.
*/
#define RT_NODE_KIND_4 0x00
#define RT_NODE_KIND_16 0x01
#define RT_NODE_KIND_48 0x02
#define RT_NODE_KIND_256 0x03
#define RT_NODE_KIND_COUNT 4
/*
* Calculate the slab block size so that we can allocate at least 32 chunks
* from the block.
*/
#define RT_SLAB_BLOCK_SIZE(size) \
Max(SLAB_DEFAULT_BLOCK_SIZE, pg_nextpower2_32(size * 32))
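/*
 * For example (illustrative, assuming SLAB_DEFAULT_BLOCK_SIZE is 8 kB):
 * a 40-byte size class keeps the default 8 kB blocks, since 32 chunks only
 * need 1280 bytes, while a 512-byte size class gets 16 kB blocks so that a
 * block holds exactly 32 chunks.
 */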
/* Common header for all nodes */
typedef struct RT_NODE
{
/* Node kind, one per search/set algorithm */
uint8 kind;
/*
* Max capacity for the current size class. Storing this in the node
* enables multiple size classes per node kind. uint8 is sufficient for
* all node kinds, because we only use this number to test if the node
* needs to grow. Since node256 never needs to grow, we let this overflow
* to zero.
*/
uint8 fanout;
/*
* Number of children. uint8 is sufficient for all node kinds, because
* nodes shrink when this number gets lower than some threshold. Since
* node256 cannot possibly have zero children, we let the counter overflow
* and we interpret zero as "256" for this node kind.
*/
uint8 count;
} RT_NODE;
/* pointer returned by allocation */
#ifdef RT_SHMEM
#define RT_PTR_ALLOC dsa_pointer
#define RT_INVALID_PTR_ALLOC InvalidDsaPointer
#define RT_PTR_ALLOC_IS_VALID(ptr) DsaPointerIsValid(ptr)
#else
#define RT_PTR_ALLOC RT_NODE *
#define RT_INVALID_PTR_ALLOC NULL
#define RT_PTR_ALLOC_IS_VALID(ptr) PointerIsValid(ptr)
#endif
/*
* A convenience type used when we need to work with a DSA pointer as well
* as its local pointer. For local memory, both members are the same, so
* we use a union.
*/
#ifdef RT_SHMEM
typedef struct RT_CHILD_PTR
#else
typedef union RT_CHILD_PTR
#endif
{
RT_PTR_ALLOC alloc;
RT_NODE *local;
} RT_CHILD_PTR;
/*
* Helper macros and functions for value storage.
* We either embed values in the child slots of the last level
* node or store pointers to values to the child slots,
* depending on the value size.
*/
#ifdef RT_VARLEN_VALUE_SIZE
#define RT_GET_VALUE_SIZE(v) RT_VARLEN_VALUE_SIZE(v)
#else
#define RT_GET_VALUE_SIZE(v) sizeof(RT_VALUE_TYPE)
#endif
/*
* Return true if the value can be stored in the child array
* of the lowest-level node, false otherwise.
*/
static inline bool
RT_VALUE_IS_EMBEDDABLE(RT_VALUE_TYPE * value_p)
{
#ifdef RT_VARLEN_VALUE_SIZE
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
#else
return false;
#endif
#else
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
#endif
}
/*
* Return true if the child pointer contains the value, false
* if the child pointer is a leaf pointer.
*/
static inline bool
RT_CHILDPTR_IS_VALUE(RT_PTR_ALLOC child)
{
#ifdef RT_VARLEN_VALUE_SIZE
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
/* check for pointer tag */
#ifdef RT_SHMEM
return child & 1;
#else
return ((uintptr_t) child) & 1;
#endif
#else
return false;
#endif
#else
return sizeof(RT_VALUE_TYPE) <= sizeof(RT_PTR_ALLOC);
#endif
}
/*
* Symbols for the maximum possible fanout are declared first, as they are
* required to declare each node kind. The declarations of the other fanout
* values follow, since they need the struct sizes of each node kind.
*/
/* max possible key chunks without struct padding */
#define RT_FANOUT_4_MAX (8 - sizeof(RT_NODE))
/* equal to two 128-bit SIMD registers, regardless of availability */
#define RT_FANOUT_16_MAX 32
/*
* This also determines the number of bits necessary for the isset array,
* so we need to be mindful of the size of bitmapword. Since bitmapword
* can be 64 bits, the only values that make sense here are 64 and 128.
* The ART paper uses at most 64 for this node kind, and one advantage
* for us is that "isset" is a single bitmapword on most platforms,
* rather than an array, allowing the compiler to get rid of loops.
*/
#define RT_FANOUT_48_MAX 64
#define RT_FANOUT_256 RT_NODE_MAX_SLOTS
/*
* Node structs, one for each "kind"
*/
/*
* node4 and node16 use one array for key chunks and another
* array of the same length for children. The keys and children
* are stored at corresponding positions, sorted by chunk.
*/
typedef struct RT_NODE_4
{
RT_NODE base;
uint8 chunks[RT_FANOUT_4_MAX];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_4;
typedef struct RT_NODE_16
{
RT_NODE base;
uint8 chunks[RT_FANOUT_16_MAX];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_16;
/*
* node48 uses a 256-element array indexed by key chunks. This array
* stores indexes into a second array containing the children.
*/
typedef struct RT_NODE_48
{
RT_NODE base;
/* for each chunk, the index into the children array */
uint8 slot_idxs[RT_NODE_MAX_SLOTS];
/* Invalid index */
#define RT_INVALID_SLOT_IDX 0xFF
/* bitmap to track which slots are in use */
bitmapword isset[RT_BM_IDX(RT_FANOUT_48_MAX)];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_48;
/*
* node256 is the largest node type. This node has an array of
* children directly indexed by chunk. Unlike other node kinds,
* its array size is by definition fixed.
*/
typedef struct RT_NODE_256
{
RT_NODE base;
/* bitmap to track which slots are in use */
bitmapword isset[RT_BM_IDX(RT_FANOUT_256)];
/* slots for 256 children */
RT_PTR_ALLOC children[RT_FANOUT_256];
} RT_NODE_256;
#if defined(RT_SHMEM)
/*
* Make sure that all nodes (except for node256) fit neatly into a DSA
* size class. We assume that RT_FANOUT_4 falls in the range where DSA size
* classes increment by 8 bytes (as of PG17, up to 64 bytes), so we just
* hard-code that one.
*/
#if SIZEOF_DSA_POINTER < 8
#define RT_FANOUT_16_LO ((96 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_16_HI Min(RT_FANOUT_16_MAX, (160 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_48 Min(RT_FANOUT_48_MAX, (512 - offsetof(RT_NODE_48, children)) / sizeof(RT_PTR_ALLOC))
#else
#define RT_FANOUT_16_LO ((160 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_16_HI Min(RT_FANOUT_16_MAX, (320 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_48 Min(RT_FANOUT_48_MAX, (768 - offsetof(RT_NODE_48, children)) / sizeof(RT_PTR_ALLOC))
#endif /* SIZEOF_DSA_POINTER < 8 */
#else /* ! RT_SHMEM */
/* doesn't really matter, but may as well use the namesake */
#define RT_FANOUT_16_LO 16
/* use maximum possible */
#define RT_FANOUT_16_HI RT_FANOUT_16_MAX
#define RT_FANOUT_48 RT_FANOUT_48_MAX
#endif /* RT_SHMEM */
/*
* To save memory in trees with sparse keys, it would make sense to have two
* size classes for the smallest kind (perhaps a high class of 5 and a low class
* of 2), but it would be more effective to utilize lazy expansion and
* path compression.
*/
#define RT_FANOUT_4 4
StaticAssertDecl(RT_FANOUT_4 <= RT_FANOUT_4_MAX, "watch struct padding");
StaticAssertDecl(RT_FANOUT_16_LO < RT_FANOUT_16_HI, "LO subclass bigger than HI");
StaticAssertDecl(RT_FANOUT_48 <= RT_FANOUT_48_MAX, "more slots than isset bits");
/*
* Node size classes
*
* Nodes of different kinds necessarily belong to different size classes.
* One innovation in our implementation compared to the ART paper is
* decoupling the notion of size class from kind.
*
* The size classes within a given node kind have the same underlying
* type, but a variable number of children/values. This is possible
* because each type (except node256) contains metadata that work the
* same way regardless of how many child slots there are. The nodes
* can introspect their allocated capacity at runtime.
*/
typedef enum RT_SIZE_CLASS
{
RT_CLASS_4 = 0,
RT_CLASS_16_LO,
RT_CLASS_16_HI,
RT_CLASS_48,
RT_CLASS_256
} RT_SIZE_CLASS;
/* Information for each size class */
typedef struct RT_SIZE_CLASS_ELEM
{
const char *name;
int fanout;
size_t allocsize;
} RT_SIZE_CLASS_ELEM;
static const RT_SIZE_CLASS_ELEM RT_SIZE_CLASS_INFO[] = {
[RT_CLASS_4] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node4",
.fanout = RT_FANOUT_4,
.allocsize = sizeof(RT_NODE_4) + RT_FANOUT_4 * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_16_LO] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node16_lo",
.fanout = RT_FANOUT_16_LO,
.allocsize = sizeof(RT_NODE_16) + RT_FANOUT_16_LO * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_16_HI] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node16_hi",
.fanout = RT_FANOUT_16_HI,
.allocsize = sizeof(RT_NODE_16) + RT_FANOUT_16_HI * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_48] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node48",
.fanout = RT_FANOUT_48,
.allocsize = sizeof(RT_NODE_48) + RT_FANOUT_48 * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_256] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node256",
.fanout = RT_FANOUT_256,
.allocsize = sizeof(RT_NODE_256),
},
};
#define RT_NUM_SIZE_CLASSES lengthof(RT_SIZE_CLASS_INFO)
#ifdef RT_SHMEM
/* A magic value used to identify our radix tree */
#define RT_RADIX_TREE_MAGIC 0x54A48167
#endif
/* Contains the actual tree, plus ancillary info */
typedef struct RT_RADIX_TREE_CONTROL
{
#ifdef RT_SHMEM
RT_HANDLE handle;
uint32 magic;
LWLock lock;
#endif
RT_PTR_ALLOC root;
uint64 max_val;
int64 num_keys;
int start_shift;
/* statistics */
#ifdef RT_DEBUG
int64 num_nodes[RT_NUM_SIZE_CLASSES];
int64 num_leaves;
#endif
} RT_RADIX_TREE_CONTROL;
/* Entry point for allocating and accessing the tree */
struct RT_RADIX_TREE
{
MemoryContext context;
/* pointing to either local memory or DSA */
RT_RADIX_TREE_CONTROL *ctl;
#ifdef RT_SHMEM
dsa_area *dsa;
#else
MemoryContextData *node_slabs[RT_NUM_SIZE_CLASSES];
/* leaf_context is used only for single-value leaves */
MemoryContextData *leaf_context;
#endif
MemoryContextData *iter_context;
};
/*
* Iteration support.
*
* Iterating over the radix tree produces each key/value pair in ascending
* order of the key.
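*
* A typical caller loop looks roughly like this (a sketch; the "foo"
* prefix is hypothetical and the value type is whatever RT_VALUE_TYPE
* was instantiated with):
*
* uint64 key;
* RT_VALUE_TYPE *value;
* foo_iter *iter = foo_begin_iterate(tree);
* while ((value = foo_iterate_next(iter, &key)) != NULL)
* {
* ... use key and value ...
* }
* foo_end_iterate(iter);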
*/
/* state for iterating over a single node */
typedef struct RT_NODE_ITER
{
RT_CHILD_PTR node;
/*
* The next index of the chunk array in RT_NODE_KIND_4 and RT_NODE_KIND_16
* nodes, or the next chunk in RT_NODE_KIND_48 and RT_NODE_KIND_256 nodes.
* 0 for the initial value.
*/
int idx;
} RT_NODE_ITER;
/* state for iterating over the whole radix tree */
struct RT_ITER
{
RT_RADIX_TREE *tree;
/*
* A stack to track iteration for each level. Level 0 is the lowest (or
* leaf) level
*/
RT_NODE_ITER node_iters[RT_MAX_LEVEL];
int top_level;
int cur_level;
/* The key constructed during iteration */
uint64 key;
};
/* verification (available only in assert-enabled builds) */
static void RT_VERIFY_NODE(RT_NODE * node);
static inline void
RT_PTR_SET_LOCAL(RT_RADIX_TREE * tree, RT_CHILD_PTR * node)
{
#ifdef RT_SHMEM
node->local = dsa_get_address(tree->dsa, node->alloc);
#endif
}
/* Convenience functions for node48 and node256 */
/* Return true if there is an entry for "chunk" */
static inline bool
RT_NODE_48_IS_CHUNK_USED(RT_NODE_48 * node, uint8 chunk)
{
return node->slot_idxs[chunk] != RT_INVALID_SLOT_IDX;
}
static inline RT_PTR_ALLOC *
RT_NODE_48_GET_CHILD(RT_NODE_48 * node, uint8 chunk)
{
return &node->children[node->slot_idxs[chunk]];
}
/* Return true if there is an entry for "chunk" */
static inline bool
RT_NODE_256_IS_CHUNK_USED(RT_NODE_256 * node, uint8 chunk)
{
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
return (node->isset[idx] & ((bitmapword) 1 << bitnum)) != 0;
}
static inline RT_PTR_ALLOC *
RT_NODE_256_GET_CHILD(RT_NODE_256 * node, uint8 chunk)
{
Assert(RT_NODE_256_IS_CHUNK_USED(node, chunk));
return &node->children[chunk];
}
/*
* Return the smallest shift that will allow storing the given key.
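* For example, keys up to 0xFF need a shift of 0, keys up to 0xFFFF a shift
* of 8, and UINT64_MAX needs RT_MAX_SHIFT, i.e. 56.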
*/
static inline int
RT_KEY_GET_SHIFT(uint64 key)
{
if (key == 0)
return 0;
else
return (pg_leftmost_one_pos64(key) / RT_SPAN) * RT_SPAN;
}
/*
* Return the max value that can be stored in the tree with the given shift.
*/
static uint64
RT_SHIFT_GET_MAX_VAL(int shift)
{
if (shift == RT_MAX_SHIFT)
return UINT64_MAX;
else
return (UINT64CONST(1) << (shift + RT_SPAN)) - 1;
}
/*
* Allocate a new node with the given node kind and size class.
*/
static inline RT_CHILD_PTR
RT_ALLOC_NODE(RT_RADIX_TREE * tree, const uint8 kind, const RT_SIZE_CLASS size_class)
{
RT_CHILD_PTR allocnode;
RT_NODE *node;
size_t allocsize;
allocsize = RT_SIZE_CLASS_INFO[size_class].allocsize;
#ifdef RT_SHMEM
allocnode.alloc = dsa_allocate(tree->dsa, allocsize);
#else
allocnode.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->node_slabs[size_class],
allocsize);
#endif
RT_PTR_SET_LOCAL(tree, &allocnode);
node = allocnode.local;
/* initialize contents */
memset(node, 0, sizeof(RT_NODE));
switch (kind)
{
case RT_NODE_KIND_4:
case RT_NODE_KIND_16:
break;
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
memset(n48->isset, 0, sizeof(n48->isset));
memset(n48->slot_idxs, RT_INVALID_SLOT_IDX, sizeof(n48->slot_idxs));
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
memset(n256->isset, 0, sizeof(n256->isset));
break;
}
default:
pg_unreachable();
}
node->kind = kind;
/*
* For node256, this will actually overflow to zero, but that's okay
* because that node doesn't need to introspect this value.
*/
node->fanout = RT_SIZE_CLASS_INFO[size_class].fanout;
#ifdef RT_DEBUG
/* update the statistics */
tree->ctl->num_nodes[size_class]++;
#endif
return allocnode;
}
/*
* Allocate a new leaf.
*/
static RT_CHILD_PTR
RT_ALLOC_LEAF(RT_RADIX_TREE * tree, size_t allocsize)
{
RT_CHILD_PTR leaf;
#ifdef RT_SHMEM
leaf.alloc = dsa_allocate(tree->dsa, allocsize);
RT_PTR_SET_LOCAL(tree, &leaf);
#else
leaf.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->leaf_context, allocsize);
#endif
#ifdef RT_DEBUG
tree->ctl->num_leaves++;
#endif
return leaf;
}
/*
* Copy relevant members of the node header.
* This is a separate function in case other fields are added.
*/
static inline void
RT_COPY_COMMON(RT_CHILD_PTR newnode, RT_CHILD_PTR oldnode)
{
(newnode.local)->count = (oldnode.local)->count;
}
/* Free the given node */
static void
RT_FREE_NODE(RT_RADIX_TREE * tree, RT_CHILD_PTR node)
{
#ifdef RT_DEBUG
int i;
/* update the statistics */
for (i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
if ((node.local)->fanout == RT_SIZE_CLASS_INFO[i].fanout)
break;
}
/*
* The fanout of node256 will appear to be zero within the node header
* because of overflow, so we need an extra check here.
*/
if (i == RT_NUM_SIZE_CLASSES)
i = RT_CLASS_256;
tree->ctl->num_nodes[i]--;
Assert(tree->ctl->num_nodes[i] >= 0);
#endif
#ifdef RT_SHMEM
dsa_free(tree->dsa, node.alloc);
#else
pfree(node.alloc);
#endif
}
static inline void
RT_FREE_LEAF(RT_RADIX_TREE * tree, RT_PTR_ALLOC leaf)
{
Assert(leaf != tree->ctl->root);
#ifdef RT_DEBUG
/* update the statistics */
tree->ctl->num_leaves--;
Assert(tree->ctl->num_leaves >= 0);
#endif
#ifdef RT_SHMEM
dsa_free(tree->dsa, leaf);
#else
pfree(leaf);
#endif
}
/***************** SEARCH *****************/
/*
* Return the address of the child corresponding to "chunk",
* or NULL if there is no such element.
*/
static inline RT_PTR_ALLOC *
RT_NODE_16_SEARCH_EQ(RT_NODE_16 * node, uint8 chunk)
{
int count = node->base.count;
#ifndef USE_NO_SIMD
Vector8 spread_chunk;
Vector8 haystack1;
Vector8 haystack2;
Vector8 cmp1;
Vector8 cmp2;
uint32 bitfield;
RT_PTR_ALLOC *slot_simd = NULL;
#endif
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
RT_PTR_ALLOC *slot = NULL;
for (int i = 0; i < count; i++)
{
if (node->chunks[i] == chunk)
{
slot = &node->children[i];
break;
}
}
#endif
#ifndef USE_NO_SIMD
/* replicate the search key */
spread_chunk = vector8_broadcast(chunk);
/* compare to all 32 keys stored in the node */
vector8_load(&haystack1, &node->chunks[0]);
vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]);
cmp1 = vector8_eq(spread_chunk, haystack1);
cmp2 = vector8_eq(spread_chunk, haystack2);
/* convert comparison to a bitfield */
bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8));
/* mask off invalid entries */
bitfield &= ((UINT64CONST(1) << count) - 1);
/* convert bitfield to index by counting trailing zeros */
if (bitfield)
slot_simd = &node->children[pg_rightmost_one_pos32(bitfield)];
Assert(slot_simd == slot);
return slot_simd;
#else
return slot;
#endif
}
/*
* Search for the child pointer corresponding to "chunk" in the given node.
*
* Return the child slot if the chunk is found, otherwise return NULL.
*/
static inline RT_PTR_ALLOC *
RT_NODE_SEARCH(RT_NODE * node, uint8 chunk)
{
/* Make sure we already converted to local pointer */
Assert(node != NULL);
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
for (int i = 0; i < n4->base.count; i++)
{
if (n4->chunks[i] == chunk)
return &n4->children[i];
}
return NULL;
}
case RT_NODE_KIND_16:
return RT_NODE_16_SEARCH_EQ((RT_NODE_16 *) node, chunk);
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
int slotpos = n48->slot_idxs[chunk];
if (slotpos == RT_INVALID_SLOT_IDX)
return NULL;
return RT_NODE_48_GET_CHILD(n48, chunk);
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
if (!RT_NODE_256_IS_CHUNK_USED(n256, chunk))
return NULL;
return RT_NODE_256_GET_CHILD(n256, chunk);
}
default:
pg_unreachable();
}
}
/*
* Search for the given key in the radix tree. Return a pointer to the value
* if found, otherwise return NULL.
*
* Since the function returns a pointer (to support variable-length values),
* the caller is responsible for locking until it's finished with the value.
*/
RT_SCOPE RT_VALUE_TYPE *
RT_FIND(RT_RADIX_TREE * tree, uint64 key)
{
RT_CHILD_PTR node;
RT_PTR_ALLOC *slot = NULL;
int shift;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
if (key > tree->ctl->max_val)
return NULL;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
node.alloc = tree->ctl->root;
shift = tree->ctl->start_shift;
/* Descend the tree */
while (shift >= 0)
{
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, RT_GET_KEY_CHUNK(key, shift));
if (slot == NULL)
return NULL;
node.alloc = *slot;
shift -= RT_SPAN;
}
if (RT_CHILDPTR_IS_VALUE(*slot))
return (RT_VALUE_TYPE *) slot;
else
{
RT_PTR_SET_LOCAL(tree, &node);
return (RT_VALUE_TYPE *) node.local;
}
}
/***************** INSERTION *****************/
#define RT_NODE_MUST_GROW(node) \
((node)->count == (node)->fanout)
/*
* Return index of the chunk and slot arrays for inserting into the node,
* such that the arrays remain ordered.
*/
static inline int
RT_NODE_4_GET_INSERTPOS(RT_NODE_4 * node, uint8 chunk, int count)
{
int idx;
for (idx = 0; idx < count; idx++)
{
if (node->chunks[idx] >= chunk)
break;
}
return idx;
}
/*
* Return index of the chunk and slot arrays for inserting into the node,
* such that the arrays remain ordered.
*/
static inline int
RT_NODE_16_GET_INSERTPOS(RT_NODE_16 * node, uint8 chunk)
{
int count = node->base.count;
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
int index;
#endif
#ifndef USE_NO_SIMD
Vector8 spread_chunk;
Vector8 haystack1;
Vector8 haystack2;
Vector8 cmp1;
Vector8 cmp2;
Vector8 min1;
Vector8 min2;
uint32 bitfield;
int index_simd;
#endif
/*
* First compare the last element. There are two reasons to branch here:
*
* 1) A realistic pattern is inserting ordered keys. In that case,
* non-SIMD platforms must do a linear search to the last chunk to find
* the insert position. This will get slower as the node fills up.
*
* 2) On SIMD platforms, we must branch anyway to make sure we don't bit
* scan an empty bitfield. Doing the branch here eliminates some work that
* we might otherwise throw away.
*/
Assert(count > 0);
if (node->chunks[count - 1] < chunk)
return count;
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
for (index = 0; index < count; index++)
{
if (node->chunks[index] > chunk)
break;
}
#endif
#ifndef USE_NO_SIMD
/*
* This is a bit more complicated than RT_NODE_16_SEARCH_EQ(), because no
* unsigned uint8 comparison instruction exists, at least for SSE2. So we
* need to play some trickery using vector8_min() to effectively get >=.
* There'll never be any equal elements in current uses, but that's what
* we get here...
*/
spread_chunk = vector8_broadcast(chunk);
vector8_load(&haystack1, &node->chunks[0]);
vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]);
min1 = vector8_min(spread_chunk, haystack1);
min2 = vector8_min(spread_chunk, haystack2);
cmp1 = vector8_eq(spread_chunk, min1);
cmp2 = vector8_eq(spread_chunk, min2);
bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8));
Assert((bitfield & ((UINT64CONST(1) << count) - 1)) != 0);
index_simd = pg_rightmost_one_pos32(bitfield);
Assert(index_simd == index);
return index_simd;
#else
return index;
#endif
}
/* Shift the elements right at "insertpos" by one */
static inline void
RT_SHIFT_ARRAYS_FOR_INSERT(uint8 *chunks, RT_PTR_ALLOC * children, int count, int insertpos)
{
/*
* This is basically a memmove, but written in a simple loop for speed on
* small inputs.
*/
for (int i = count - 1; i >= insertpos; i--)
{
/* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */
#ifdef __GNUC__
__asm__("");
#endif
chunks[i + 1] = chunks[i];
children[i + 1] = children[i];
}
}
/*
* Copy both chunk and slot arrays into the right
* place. The caller is responsible for inserting the new element.
*/
static inline void
RT_COPY_ARRAYS_FOR_INSERT(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children,
uint8 *src_chunks, RT_PTR_ALLOC * src_children,
int count, int insertpos)
{
for (int i = 0; i < count; i++)
{
int sourceidx = i;
/* use a branch-free computation to skip the index of the new element */
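/*
 * For example, with insertpos = 1 and count = 3, source indexes 0,1,2 land
 * at destination indexes 0,2,3, leaving destination index 1 free for the
 * new element.
 */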
int destidx = i + (i >= insertpos);
dst_chunks[destidx] = src_chunks[sourceidx];
dst_children[destidx] = src_children[sourceidx];
}
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_256(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
/* Mark the slot used for "chunk" */
n256->isset[idx] |= ((bitmapword) 1 << bitnum);
n256->base.count++;
RT_VERIFY_NODE((RT_NODE *) n256);
return RT_NODE_256_GET_CHILD(n256, chunk);
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
RT_CHILD_PTR newnode;
RT_NODE_256 *new256;
int i = 0;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_256, RT_CLASS_256);
new256 = (RT_NODE_256 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int word_num = 0; word_num < RT_BM_IDX(RT_NODE_MAX_SLOTS); word_num++)
{
bitmapword bitmap = 0;
/*
* Bit manipulation is a surprisingly large portion of the overhead in
* the naive implementation. Doing stores word-at-a-time removes a lot
* of that overhead.
*/
for (int bit = 0; bit < BITS_PER_BITMAPWORD; bit++)
{
uint8 offset = n48->slot_idxs[i];
if (offset != RT_INVALID_SLOT_IDX)
{
bitmap |= ((bitmapword) 1 << bit);
new256->children[i] = n48->children[offset];
}
i++;
}
new256->isset[word_num] = bitmap;
}
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return RT_ADD_CHILD_256(tree, newnode, chunk);
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_48(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
int insertpos;
int idx = 0;
bitmapword w,
inverse;
/* get the first word with at least one bit not set */
for (int i = 0; i < RT_BM_IDX(RT_FANOUT_48_MAX); i++)
{
w = n48->isset[i];
if (w < ~((bitmapword) 0))
{
idx = i;
break;
}
}
/* To get the first unset bit in w, get the first set bit in ~w */
inverse = ~w;
insertpos = idx * BITS_PER_BITMAPWORD;
insertpos += bmw_rightmost_one_pos(inverse);
Assert(insertpos < n48->base.fanout);
/* mark the slot used by setting the rightmost zero bit */
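/*
 * For example, if w is 0b0111 (slots 0-2 in use), then w + 1 is 0b1000,
 * which sets exactly the bit for the first free slot.
 */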
n48->isset[idx] |= w + 1;
/* insert new chunk into place */
n48->slot_idxs[chunk] = insertpos;
n48->base.count++;
RT_VERIFY_NODE((RT_NODE *) n48);
return &n48->children[insertpos];
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int insertpos;
if (n16->base.fanout < RT_FANOUT_16_HI)
{
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
Assert(n16->base.fanout == RT_FANOUT_16_LO);
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_HI);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over existing entries */
RT_COPY_COMMON(newnode, node);
Assert(n16->base.count == RT_FANOUT_16_LO);
insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk);
RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children,
n16->chunks, n16->children,
RT_FANOUT_16_LO, insertpos);
/* insert new chunk into place */
new16->chunks[insertpos] = chunk;
new16->base.count++;
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update references */
RT_FREE_NODE(tree, node);
*parent_slot = newnode.alloc;
return &new16->children[insertpos];
}
else
{
RT_CHILD_PTR newnode;
RT_NODE_48 *new48;
int idx,
bit;
Assert(n16->base.fanout == RT_FANOUT_16_HI);
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48);
new48 = (RT_NODE_48 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int i = 0; i < RT_FANOUT_16_HI; i++)
new48->slot_idxs[n16->chunks[i]] = i;
memcpy(&new48->children[0], &n16->children[0], RT_FANOUT_16_HI * sizeof(new48->children[0]));
/*
* Since we just copied a dense array, we can fill "isset" using a
* single store, provided the length of that array is at most the
* number of bits in a bitmapword.
*/
Assert(RT_FANOUT_16_HI <= BITS_PER_BITMAPWORD);
new48->isset[0] = (bitmapword) (((uint64) 1 << RT_FANOUT_16_HI) - 1);
/* put the new child at the end of the copied entries */
insertpos = RT_FANOUT_16_HI;
idx = RT_BM_IDX(insertpos);
bit = RT_BM_BIT(insertpos);
/* mark the slot used */
new48->isset[idx] |= ((bitmapword) 1 << bit);
/* insert new chunk into place */
new48->slot_idxs[chunk] = insertpos;
new48->base.count++;
RT_VERIFY_NODE((RT_NODE *) new48);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return &new48->children[insertpos];
}
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_16(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk);
/* shift chunks and children */
RT_SHIFT_ARRAYS_FOR_INSERT(n16->chunks, n16->children,
n16->base.count, insertpos);
/* insert new chunk into place */
n16->chunks[insertpos] = chunk;
n16->base.count++;
RT_VERIFY_NODE((RT_NODE *) n16);
return &n16->children[insertpos];
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
int insertpos;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over existing entries */
RT_COPY_COMMON(newnode, node);
Assert(n4->base.count == RT_FANOUT_4);
insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, RT_FANOUT_4);
RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children,
n4->chunks, n4->children,
RT_FANOUT_4, insertpos);
/* insert new chunk into place */
new16->chunks[insertpos] = chunk;
new16->base.count++;
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return &new16->children[insertpos];
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_4(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
int count = n4->base.count;
int insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, count);
/* shift chunks and children */
RT_SHIFT_ARRAYS_FOR_INSERT(n4->chunks, n4->children,
count, insertpos);
/* insert new chunk into place */
n4->chunks[insertpos] = chunk;
n4->base.count++;
RT_VERIFY_NODE((RT_NODE *) n4);
return &n4->children[insertpos];
}
/*
* Reserve a slot in "node"'s child array. The caller will populate it
* with the actual child pointer.
*
* "parent_slot" is the address of the parent's child pointer to "node".
* If the node we're inserting into needs to grow, we update the parent's
* child pointer with the pointer to the new larger node.
*/
static inline RT_PTR_ALLOC *
RT_NODE_INSERT(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE *n = node.local;
switch (n->kind)
{
case RT_NODE_KIND_4:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_4(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_4(tree, node, chunk);
}
case RT_NODE_KIND_16:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_16(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_16(tree, node, chunk);
}
case RT_NODE_KIND_48:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_48(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_48(tree, node, chunk);
}
case RT_NODE_KIND_256:
return RT_ADD_CHILD_256(tree, node, chunk);
default:
pg_unreachable();
}
}
/*
* The radix tree doesn't have sufficient height. Put new node(s) on top,
* and move the old node below it.
*/
static pg_noinline void
RT_EXTEND_UP(RT_RADIX_TREE * tree, uint64 key)
{
int target_shift = RT_KEY_GET_SHIFT(key);
int shift = tree->ctl->start_shift;
Assert(shift < target_shift);
/* Grow tree upwards until start shift can accommodate the key */
while (shift < target_shift)
{
RT_CHILD_PTR node;
RT_NODE_4 *n4;
node = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = 0;
n4->children[0] = tree->ctl->root;
/* Update the root */
tree->ctl->root = node.alloc;
shift += RT_SPAN;
}
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(target_shift);
tree->ctl->start_shift = target_shift;
}
/*
* Insert a chain of nodes until we reach the lowest level,
* and return the address of a slot to be filled further up
* the call stack.
*/
static pg_noinline RT_PTR_ALLOC *
RT_EXTEND_DOWN(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift)
{
RT_CHILD_PTR node,
child;
RT_NODE_4 *n4;
/*
* The child pointer of the first node in the chain goes in the
* caller-provided slot.
*/
child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
*parent_slot = child.alloc;
node = child;
shift -= RT_SPAN;
while (shift > 0)
{
child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
/* We open-code the insertion ourselves, for speed. */
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, shift);
n4->children[0] = child.alloc;
node = child;
shift -= RT_SPAN;
}
Assert(shift == 0);
/* Reserve slot for the value. */
n4 = (RT_NODE_4 *) node.local;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, 0);
n4->base.count = 1;
return &n4->children[0];
}
/*
* Workhorse for RT_SET
*
* "parent_slot" is the address of the child pointer we just followed,
* in the parent's array of children, needed if inserting into the
* current node causes it to grow.
*/
static RT_PTR_ALLOC *
RT_GET_SLOT_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift, bool *found)
{
RT_PTR_ALLOC *slot;
RT_CHILD_PTR node;
uint8 chunk = RT_GET_KEY_CHUNK(key, shift);
node.alloc = *parent_slot;
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, chunk);
if (slot == NULL)
{
*found = false;
/* reserve slot for the caller to populate */
slot = RT_NODE_INSERT(tree, parent_slot, node, chunk);
if (shift == 0)
return slot;
else
return RT_EXTEND_DOWN(tree, slot, key, shift);
}
else
{
if (shift == 0)
{
*found = true;
return slot;
}
else
return RT_GET_SLOT_RECURSIVE(tree, slot, key, shift - RT_SPAN, found);
}
}
/*
* Set the given key to the value that value_p points to. If the entry already
* exists, update its value and return true; otherwise insert it and return false.
*
* Taking a lock in exclusive mode is the caller's responsibility.
*/
RT_SCOPE bool
RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p)
{
bool found;
RT_PTR_ALLOC *slot;
size_t value_sz = RT_GET_VALUE_SIZE(value_p);
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
/* Extend the tree if necessary */
if (unlikely(key > tree->ctl->max_val))
{
if (tree->ctl->num_keys == 0)
{
RT_CHILD_PTR node;
RT_NODE_4 *n4;
int start_shift = RT_KEY_GET_SHIFT(key);
/*
* With an empty root node, we don't extend the tree upwards,
* since that would result in orphan empty nodes. Instead we open
* code inserting into the root node and extend downward from
* there.
*/
node.alloc = tree->ctl->root;
RT_PTR_SET_LOCAL(tree, &node);
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, start_shift);
slot = RT_EXTEND_DOWN(tree, &n4->children[0], key, start_shift);
found = false;
tree->ctl->start_shift = start_shift;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(start_shift);
goto have_slot;
}
else
RT_EXTEND_UP(tree, key);
}
slot = RT_GET_SLOT_RECURSIVE(tree, &tree->ctl->root,
key, tree->ctl->start_shift, &found);
have_slot:
Assert(slot != NULL);
if (RT_VALUE_IS_EMBEDDABLE(value_p))
{
/* free the existing leaf */
if (found && !RT_CHILDPTR_IS_VALUE(*slot))
RT_FREE_LEAF(tree, *slot);
/* store value directly in child pointer slot */
memcpy(slot, value_p, value_sz);
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
/* tag child pointer */
#ifdef RT_SHMEM
*slot |= 1;
#else
*((uintptr_t *) slot) |= 1;
#endif
#endif
}
else
{
RT_CHILD_PTR leaf;
if (found && !RT_CHILDPTR_IS_VALUE(*slot))
{
Assert(RT_PTR_ALLOC_IS_VALID(*slot));
leaf.alloc = *slot;
RT_PTR_SET_LOCAL(tree, &leaf);
if (RT_GET_VALUE_SIZE((RT_VALUE_TYPE *) leaf.local) != value_sz)
{
/*
* different sizes, so first free the existing leaf before
* allocating a new one
*/
RT_FREE_LEAF(tree, *slot);
leaf = RT_ALLOC_LEAF(tree, value_sz);
*slot = leaf.alloc;
}
}
else
{
/* allocate new leaf and store it in the child array */
leaf = RT_ALLOC_LEAF(tree, value_sz);
*slot = leaf.alloc;
}
memcpy(leaf.local, value_p, value_sz);
}
/* Update the statistics */
if (!found)
tree->ctl->num_keys++;
return found;
}
/***************** SETUP / TEARDOWN *****************/
/*
* Create the radix tree in the given memory context and return it.
*
* All local memory required for a radix tree is allocated in the given
* memory context and its children. Note that RT_FREE() will delete all
* allocated space within the given memory context, so the dsa_area should
* be created in a different context.
*/
RT_SCOPE RT_RADIX_TREE *
#ifdef RT_SHMEM
RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id)
#else
RT_CREATE(MemoryContext ctx)
#endif
{
RT_RADIX_TREE *tree;
MemoryContext old_ctx;
RT_CHILD_PTR rootnode;
#ifdef RT_SHMEM
dsa_pointer dp;
#endif
old_ctx = MemoryContextSwitchTo(ctx);
tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE));
tree->context = ctx;
/*
* Separate context for iteration in case the tree context doesn't support
* pfree
*/
tree->iter_context = AllocSetContextCreate(ctx,
RT_STR(RT_PREFIX) "_radix_tree iter context",
ALLOCSET_SMALL_SIZES);
#ifdef RT_SHMEM
tree->dsa = dsa;
dp = dsa_allocate0(dsa, sizeof(RT_RADIX_TREE_CONTROL));
tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, dp);
tree->ctl->handle = dp;
tree->ctl->magic = RT_RADIX_TREE_MAGIC;
LWLockInitialize(&tree->ctl->lock, tranche_id);
#else
tree->ctl = (RT_RADIX_TREE_CONTROL *) palloc0(sizeof(RT_RADIX_TREE_CONTROL));
/* Create a slab context for each size class */
for (int i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i];
size_t inner_blocksize = RT_SLAB_BLOCK_SIZE(size_class.allocsize);
tree->node_slabs[i] = SlabContextCreate(ctx,
size_class.name,
inner_blocksize,
size_class.allocsize);
}
/* By default we use the passed context for leaves. */
tree->leaf_context = tree->context;
#ifndef RT_VARLEN_VALUE_SIZE
/*
* For leaves storing fixed-length values, we use a slab context to avoid
* the possibility of space wastage by power-of-2 rounding up.
*/
if (sizeof(RT_VALUE_TYPE) > sizeof(RT_PTR_ALLOC))
tree->leaf_context = SlabContextCreate(ctx,
RT_STR(RT_PREFIX) "_radix_tree leaf context",
RT_SLAB_BLOCK_SIZE(sizeof(RT_VALUE_TYPE)),
sizeof(RT_VALUE_TYPE));
#endif /* !RT_VARLEN_VALUE_SIZE */
#endif /* RT_SHMEM */
/* add root node now so that RT_SET can assume it exists */
rootnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
tree->ctl->root = rootnode.alloc;
tree->ctl->start_shift = 0;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(0);
MemoryContextSwitchTo(old_ctx);
return tree;
}
#ifdef RT_SHMEM
RT_SCOPE RT_RADIX_TREE *
RT_ATTACH(dsa_area *dsa, RT_HANDLE handle)
{
RT_RADIX_TREE *tree;
dsa_pointer control;
tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE));
/* Find the control object in shared memory */
control = handle;
tree->dsa = dsa;
tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, control);
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
return tree;
}
RT_SCOPE void
RT_DETACH(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
pfree(tree);
}
RT_SCOPE RT_HANDLE
RT_GET_HANDLE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
return tree->ctl->handle;
}
RT_SCOPE void
RT_LOCK_EXCLUSIVE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockAcquire(&tree->ctl->lock, LW_EXCLUSIVE);
}
RT_SCOPE void
RT_LOCK_SHARE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockAcquire(&tree->ctl->lock, LW_SHARED);
}
RT_SCOPE void
RT_UNLOCK(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockRelease(&tree->ctl->lock);
}
/*
* Recursively free all nodes allocated in the DSA area.
*/
static void
RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift)
{
RT_CHILD_PTR node;
check_stack_depth();
node.alloc = ptr;
RT_PTR_SET_LOCAL(tree, &node);
switch (node.local->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node.local;
for (int i = 0; i < n4->base.count; i++)
{
RT_PTR_ALLOC child = n4->children[i];
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
for (int i = 0; i < n16->base.count; i++)
{
RT_PTR_ALLOC child = n16->children[i];
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
RT_PTR_ALLOC child;
if (!RT_NODE_48_IS_CHUNK_USED(n48, i))
continue;
child = *RT_NODE_48_GET_CHILD(n48, i);
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
RT_PTR_ALLOC child;
if (!RT_NODE_256_IS_CHUNK_USED(n256, i))
continue;
child = *RT_NODE_256_GET_CHILD(n256, i);
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
}
/* Free the inner node */
dsa_free(tree->dsa, ptr);
}
#endif
/*
* Free the radix tree, including all nodes and leaves.
*/
RT_SCOPE void
RT_FREE(RT_RADIX_TREE * tree)
{
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
/* Free all memory used for radix tree nodes */
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
RT_FREE_RECURSE(tree, tree->ctl->root, tree->ctl->start_shift);
/*
* Vandalize the control block to help catch programming errors where other
* backends access the memory formerly occupied by this radix tree.
*/
tree->ctl->magic = 0;
dsa_free(tree->dsa, tree->ctl->handle);
#endif
/*
* Free all space allocated within the tree's context and delete all child
* contexts such as those used for nodes.
*/
MemoryContextReset(tree->context);
}
/***************** ITERATION *****************/
/*
* Create and return the iterator for the given radix tree.
*
* Taking a lock in shared mode during the iteration is the caller's
* responsibility.
*/
RT_SCOPE RT_ITER *
RT_BEGIN_ITERATE(RT_RADIX_TREE * tree)
{
RT_ITER *iter;
RT_CHILD_PTR root;
iter = (RT_ITER *) MemoryContextAllocZero(tree->iter_context,
sizeof(RT_ITER));
iter->tree = tree;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
root.alloc = iter->tree->ctl->root;
RT_PTR_SET_LOCAL(tree, &root);
iter->top_level = iter->tree->ctl->start_shift / RT_SPAN;
/* Set the root to start */
iter->cur_level = iter->top_level;
iter->node_iters[iter->cur_level].node = root;
iter->node_iters[iter->cur_level].idx = 0;
return iter;
}
/*
* Scan the inner node and return the next child pointer if one exists, otherwise
* return NULL.
*/
static inline RT_PTR_ALLOC *
RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
{
uint8 key_chunk = 0;
RT_NODE_ITER *node_iter;
RT_CHILD_PTR node;
RT_PTR_ALLOC *slot = NULL;
#ifdef RT_SHMEM
Assert(iter->tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
node_iter = &(iter->node_iters[level]);
node = node_iter->node;
Assert(node.local != NULL);
switch ((node.local)->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
if (node_iter->idx >= n4->base.count)
return NULL;
slot = &n4->children[node_iter->idx];
key_chunk = n4->chunks[node_iter->idx];
node_iter->idx++;
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
if (node_iter->idx >= n16->base.count)
return NULL;
slot = &n16->children[node_iter->idx];
key_chunk = n16->chunks[node_iter->idx];
node_iter->idx++;
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
int chunk;
for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (RT_NODE_48_IS_CHUNK_USED(n48, chunk))
break;
}
if (chunk >= RT_NODE_MAX_SLOTS)
return NULL;
slot = RT_NODE_48_GET_CHILD(n48, chunk);
key_chunk = chunk;
node_iter->idx = chunk + 1;
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) (node.local);
int chunk;
for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (RT_NODE_256_IS_CHUNK_USED(n256, chunk))
break;
}
if (chunk >= RT_NODE_MAX_SLOTS)
return NULL;
slot = RT_NODE_256_GET_CHILD(n256, chunk);
key_chunk = chunk;
node_iter->idx = chunk + 1;
break;
}
}
/* Update the key */
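/*
 * With the 8-bit span used here, at level 1 this clears key bits 8..15
 * and ORs in the chunk just found; level 0 likewise fills bits 0..7.
 */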
iter->key &= ~(((uint64) RT_CHUNK_MASK) << (level * RT_SPAN));
iter->key |= (((uint64) key_chunk) << (level * RT_SPAN));
return slot;
}
/*
* Return a pointer to the next value and set *key_p to its key, if there is
* one. Otherwise return NULL.
*/
RT_SCOPE RT_VALUE_TYPE *
RT_ITERATE_NEXT(RT_ITER * iter, uint64 *key_p)
{
RT_PTR_ALLOC *slot = NULL;
while (iter->cur_level <= iter->top_level)
{
RT_CHILD_PTR node;
slot = RT_NODE_ITERATE_NEXT(iter, iter->cur_level);
if (iter->cur_level == 0 && slot != NULL)
{
/* Found a value at the leaf node */
*key_p = iter->key;
node.alloc = *slot;
if (RT_CHILDPTR_IS_VALUE(*slot))
return (RT_VALUE_TYPE *) slot;
else
{
RT_PTR_SET_LOCAL(iter->tree, &node);
return (RT_VALUE_TYPE *) node.local;
}
}
if (slot != NULL)
{
/* Found the child slot, move down the tree */
node.alloc = *slot;
RT_PTR_SET_LOCAL(iter->tree, &node);
iter->cur_level--;
iter->node_iters[iter->cur_level].node = node;
iter->node_iters[iter->cur_level].idx = 0;
}
else
{
/* No more child slots at this level, so move up the tree */
iter->cur_level++;
}
}
/* We've visited all nodes, so the iteration is finished */
return NULL;
}
/*
* Terminate the iteration. The caller is responsible for releasing any locks.
*/
RT_SCOPE void
RT_END_ITERATE(RT_ITER * iter)
{
pfree(iter);
}
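/*
 * A minimal iteration sketch (hypothetical caller code; "tree" is assumed
 * to be an already-populated radix tree, and the lock calls apply only
 * when RT_SHMEM is defined):
 *
 *	RT_ITER    *iter;
 *	uint64		key;
 *	RT_VALUE_TYPE *value;
 *
 *	RT_LOCK_SHARE(tree);
 *	iter = RT_BEGIN_ITERATE(tree);
 *	while ((value = RT_ITERATE_NEXT(iter, &key)) != NULL)
 *	{
 *		(visit "key" and "value"; keys arrive in ascending order)
 *	}
 *	RT_END_ITERATE(iter);
 *	RT_UNLOCK(tree);
 */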
/***************** DELETION *****************/
#ifdef RT_USE_DELETE
/* Delete the element at "deletepos" */
static inline void
RT_SHIFT_ARRAYS_AND_DELETE(uint8 *chunks, RT_PTR_ALLOC * children, int count, int deletepos)
{
/*
* This is basically a memmove, but written in a simple loop for speed on
* small inputs.
*/
for (int i = deletepos; i < count - 1; i++)
{
/* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */
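/*
 * (The empty asm below is presumably there to keep the compiler from
 * recognizing this loop as a memmove idiom and replacing it with a
 * library call.)
 */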
#ifdef __GNUC__
__asm__("");
#endif
chunks[i] = chunks[i + 1];
children[i] = children[i + 1];
}
}
/*
* Copy both chunk and slot arrays into the right place. The element at
* "deletepos" is deleted by skipping it.
*/
static inline void
RT_COPY_ARRAYS_AND_DELETE(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children,
uint8 *src_chunks, RT_PTR_ALLOC * src_children,
int count, int deletepos)
{
for (int i = 0; i < count - 1; i++)
{
/*
* use a branch-free computation to skip the index of the deleted
* element
*/
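/*
 * For example, with count = 4 and deletepos = 1, sourceidx takes the
 * values 0, 2, 3 while destidx takes 0, 1, 2, so the deleted element is
 * skipped without any branch inside the loop.
 */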
int sourceidx = i + (i >= deletepos);
int destidx = i;
dst_chunks[destidx] = src_chunks[sourceidx];
dst_children[destidx] = src_children[sourceidx];
}
}
/*
* Note: While all node-growing functions are called to perform an insertion
* when no more space is available, shrinking is not a hard-and-fast requirement.
* When shrinking nodes, we generally wait until the count is about 3/4 of
* the next lower node's fanout. This prevents ping-ponging between different
* node sizes.
*
* Some shrinking functions delete first and then shrink, either because we
* must or because it's fast and simple that way. Sometimes it's faster to
* delete while shrinking.
*/
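/*
 * For example, if RT_FANOUT_16_LO is 16, a node48 is not shrunk to a
 * node16 until its count has dropped to 12, leaving headroom before the
 * new node16 would need to grow again.
 */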
/*
* Move contents of a node256 to a node48. Any deletion should have happened
* in the caller.
*/
static void pg_noinline
RT_SHRINK_NODE_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
RT_CHILD_PTR newnode;
RT_NODE_48 *new48;
int slot_idx = 0;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48);
new48 = (RT_NODE_48 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
if (RT_NODE_256_IS_CHUNK_USED(n256, i))
{
new48->slot_idxs[i] = slot_idx;
new48->children[slot_idx] = n256->children[i];
slot_idx++;
}
}
/*
* Since we just copied a dense array, we can fill "isset" using a single
* store, provided the length of that array is at most the number of bits
* in a bitmapword.
*/
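/*
 * For example, if 20 entries were copied, the word stored below is
 * ((uint64) 1 << 20) - 1, i.e. the low 20 bits set.
 */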
Assert(n256->base.count <= BITS_PER_BITMAPWORD);
new48->isset[0] = (bitmapword) (((uint64) 1 << n256->base.count) - 1);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
int shrink_threshold;
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
/* Mark the slot free for "chunk" */
n256->isset[idx] &= ~((bitmapword) 1 << bitnum);
n256->base.count--;
/*
* A full node256 will have a count of zero because of overflow, so we
* delete first before checking the shrink threshold.
*/
Assert(n256->base.count > 0);
/* This simplifies RT_SHRINK_NODE_256() */
shrink_threshold = BITS_PER_BITMAPWORD;
shrink_threshold = Min(RT_FANOUT_48 / 4 * 3, shrink_threshold);
if (n256->base.count <= shrink_threshold)
RT_SHRINK_NODE_256(tree, parent_slot, node, chunk);
}
/*
* Move contents of a node48 to a node16. Any deletion should have happened
* in the caller.
*/
static void pg_noinline
RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
int destidx = 0;
/*
* Initialize new node. For now we skip the larger node16 size class for
* simplicity.
*/
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over all existing entries */
RT_COPY_COMMON(newnode, node);
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (n48->slot_idxs[chunk] != RT_INVALID_SLOT_IDX)
{
new16->chunks[destidx] = chunk;
new16->children[destidx] = n48->children[n48->slot_idxs[chunk]];
destidx++;
}
}
Assert(destidx < new16->base.fanout);
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
int deletepos = n48->slot_idxs[chunk];
/* For now we skip the larger node16 size class for simplicity */
int shrink_threshold = RT_FANOUT_16_LO / 4 * 3;
int idx;
int bitnum;
Assert(deletepos != RT_INVALID_SLOT_IDX);
idx = RT_BM_IDX(deletepos);
bitnum = RT_BM_BIT(deletepos);
n48->isset[idx] &= ~((bitmapword) 1 << bitnum);
n48->slot_idxs[chunk] = RT_INVALID_SLOT_IDX;
n48->base.count--;
/*
* To keep shrinking simple, do it after deleting, which is fast for
* node48 anyway.
*/
if (n48->base.count <= shrink_threshold)
RT_SHRINK_NODE_48(tree, parent_slot, node, chunk);
}
/*
* Move contents of a node16 to a node4, and delete the element at "deletepos".
* By deleting as we move, we can avoid memmove operations in the new
* node.
*/
static void pg_noinline
RT_SHRINK_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 deletepos)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_4 *new4;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
new4 = (RT_NODE_4 *) newnode.local;
/* copy over existing entries, except for the one at "deletepos" */
RT_COPY_COMMON(newnode, node);
RT_COPY_ARRAYS_AND_DELETE(new4->chunks, new4->children,
n16->chunks, n16->children,
n16->base.count, deletepos);
new4->base.count--;
RT_VERIFY_NODE((RT_NODE *) new4);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int deletepos = slot - n16->children;
/*
* The threshold of 4 for shrinking to node4 is hard-coded: after shrinking,
* the new node ends up with 3 elements, and 3 is the largest count for which
* linear search is faster than SIMD search, at least on x86-64.
*/
if (n16->base.count <= 4)
{
RT_SHRINK_NODE_16(tree, parent_slot, node, deletepos);
return;
}
Assert(deletepos >= 0);
Assert(n16->chunks[deletepos] == chunk);
RT_SHIFT_ARRAYS_AND_DELETE(n16->chunks, n16->children,
n16->base.count, deletepos);
n16->base.count--;
}
static inline void
RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node.local;
if (n4->base.count == 1)
{
Assert(n4->chunks[0] == chunk);
/*
* If we're deleting the last entry from the root child node, don't
* free it; instead mark both the tree and the root child node empty.
* That way, RT_SET can assume it exists.
*/
if (parent_slot == &tree->ctl->root)
{
n4->base.count = 0;
tree->ctl->start_shift = 0;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(0);
}
else
{
/*
* Deleting the last entry, so just free the entire node.
* RT_DELETE_RECURSIVE has already freed the value and lower-level
* children.
*/
RT_FREE_NODE(tree, node);
/*
* Also null out the parent's slot -- this tells the next higher
* level to delete its child pointer
*/
*parent_slot = RT_INVALID_PTR_ALLOC;
}
}
else
{
int deletepos = slot - n4->children;
Assert(deletepos >= 0);
Assert(n4->chunks[deletepos] == chunk);
RT_SHIFT_ARRAYS_AND_DELETE(n4->chunks, n4->children,
n4->base.count, deletepos);
n4->base.count--;
}
}
/*
* Delete the child pointer corresponding to "key" in the given node.
*/
static inline void
RT_NODE_DELETE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
switch ((node.local)->kind)
{
case RT_NODE_KIND_4:
{
RT_REMOVE_CHILD_4(tree, parent_slot, node, chunk, slot);
return;
}
case RT_NODE_KIND_16:
{
RT_REMOVE_CHILD_16(tree, parent_slot, node, chunk, slot);
return;
}
case RT_NODE_KIND_48:
{
RT_REMOVE_CHILD_48(tree, parent_slot, node, chunk);
return;
}
case RT_NODE_KIND_256:
{
RT_REMOVE_CHILD_256(tree, parent_slot, node, chunk);
return;
}
default:
pg_unreachable();
}
}
/* workhorse for RT_DELETE */
static bool
RT_DELETE_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift)
{
RT_PTR_ALLOC *slot;
RT_CHILD_PTR node;
uint8 chunk = RT_GET_KEY_CHUNK(key, shift);
node.alloc = *parent_slot;
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, chunk);
if (slot == NULL)
return false;
if (shift == 0)
{
if (!RT_CHILDPTR_IS_VALUE(*slot))
RT_FREE_LEAF(tree, *slot);
RT_NODE_DELETE(tree, parent_slot, node, chunk, slot);
return true;
}
else
{
bool deleted;
deleted = RT_DELETE_RECURSIVE(tree, slot, key, shift - RT_SPAN);
/* Child node was freed, so delete its slot now */
if (*slot == RT_INVALID_PTR_ALLOC)
{
Assert(deleted);
RT_NODE_DELETE(tree, parent_slot, node, chunk, slot);
}
return deleted;
}
}
/*
* Delete the given key from the radix tree. If the key is found, delete it
* and return true; otherwise do nothing and return false.
*
* Taking a lock in exclusive mode is the caller's responsibility.
*/
RT_SCOPE bool
RT_DELETE(RT_RADIX_TREE * tree, uint64 key)
{
bool deleted;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
if (key > tree->ctl->max_val)
return false;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
deleted = RT_DELETE_RECURSIVE(tree, &tree->ctl->root,
key, tree->ctl->start_shift);
/* If we found the key to delete, update the statistics */
if (deleted)
{
tree->ctl->num_keys--;
Assert(tree->ctl->num_keys >= 0);
}
return deleted;
}
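/*
 * A minimal deletion sketch (hypothetical caller code; the lock calls are
 * generated only when RT_SHMEM is defined, and "tree" and "key" come from
 * the caller):
 *
 *	bool		found;
 *
 *	RT_LOCK_EXCLUSIVE(tree);
 *	found = RT_DELETE(tree, key);
 *	RT_UNLOCK(tree);
 *
 * "found" is true only if the key was present and its value was removed.
 */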
#endif /* RT_USE_DELETE */
/***************** UTILITY FUNCTIONS *****************/
/*
* Return the amount of memory used by the radix tree.
*
* Since dsa_get_total_size() does appropriate locking, the caller doesn't
* need to take a lock.
*/
RT_SCOPE uint64
RT_MEMORY_USAGE(RT_RADIX_TREE * tree)
{
size_t total = 0;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
total = dsa_get_total_size(tree->dsa);
#else
total = MemoryContextMemAllocated(tree->context, true);
#endif
return total;
}
/*
* Perform some sanity checks on the given node.
*/
static void
RT_VERIFY_NODE(RT_NODE * node)
{
#ifdef USE_ASSERT_CHECKING
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
/* RT_DUMP_NODE(node); */
for (int i = 1; i < n4->base.count; i++)
Assert(n4->chunks[i - 1] < n4->chunks[i]);
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node;
/* RT_DUMP_NODE(node); */
for (int i = 1; i < n16->base.count; i++)
Assert(n16->chunks[i - 1] < n16->chunks[i]);
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
int cnt = 0;
/* RT_DUMP_NODE(node); */
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
uint8 slot = n48->slot_idxs[i];
int idx = RT_BM_IDX(slot);
int bitnum = RT_BM_BIT(slot);
if (!RT_NODE_48_IS_CHUNK_USED(n48, i))
continue;
/* Check if the corresponding slot is used */
Assert(slot < node->fanout);
Assert((n48->isset[idx] & ((bitmapword) 1 << bitnum)) != 0);
cnt++;
}
Assert(n48->base.count == cnt);
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
int cnt = 0;
/* RT_DUMP_NODE(node); */
for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
cnt += bmw_popcount(n256->isset[i]);
/*
* Check that the number of used chunks matches the count, accounting
* for overflow
*/
if (cnt == RT_FANOUT_256)
Assert(n256->base.count == 0);
else
Assert(n256->base.count == cnt);
break;
}
}
#endif
}
/***************** DEBUG FUNCTIONS *****************/
#ifdef RT_DEBUG
/*
* Print out tree stats, some of which are only collected in debugging builds.
*/
RT_SCOPE void
RT_STATS(RT_RADIX_TREE * tree)
{
fprintf(stderr, "max_val = " UINT64_FORMAT "\n", tree->ctl->max_val);
fprintf(stderr, "num_keys = %lld\n", (long long) tree->ctl->num_keys);
#ifdef RT_SHMEM
fprintf(stderr, "handle = " DSA_POINTER_FORMAT "\n", tree->ctl->handle);
#endif
fprintf(stderr, "height = %d", tree->ctl->start_shift / RT_SPAN);
for (int i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i];
fprintf(stderr, ", n%d = %lld", size_class.fanout, (long long) tree->ctl->num_nodes[i]);
}
fprintf(stderr, ", leaves = %lld", (long long) tree->ctl->num_leaves);
fprintf(stderr, "\n");
}
/*
* Print out debugging information about the given node.
*/
static void
pg_attribute_unused()
RT_DUMP_NODE(RT_NODE * node)
{
#ifdef RT_SHMEM
#define RT_CHILD_PTR_FORMAT DSA_POINTER_FORMAT
#else
#define RT_CHILD_PTR_FORMAT "%p"
#endif
fprintf(stderr, "kind %d, fanout %d, count %u\n",
(node->kind == RT_NODE_KIND_4) ? 4 :
(node->kind == RT_NODE_KIND_16) ? 16 :
(node->kind == RT_NODE_KIND_48) ? 48 : 256,
node->fanout == 0 ? 256 : node->fanout,
node->count == 0 ? 256 : node->count);
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
fprintf(stderr, "chunks and slots:\n");
for (int i = 0; i < n4->base.count; i++)
{
fprintf(stderr, " [%d] chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
i, n4->chunks[i], n4->children[i]);
}
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node;
fprintf(stderr, "chunks and slots:\n");
for (int i = 0; i < n16->base.count; i++)
{
fprintf(stderr, " [%d] chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
i, n16->chunks[i], n16->children[i]);
}
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
char *sep = "";
fprintf(stderr, "slot_idxs: \n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_48_IS_CHUNK_USED(n48, chunk))
continue;
fprintf(stderr, " idx[%d] = %d\n",
chunk, n48->slot_idxs[chunk]);
}
fprintf(stderr, "isset-bitmap: ");
for (int i = 0; i < (RT_FANOUT_48_MAX / BITS_PER_BYTE); i++)
{
fprintf(stderr, "%s%x", sep, ((uint8 *) n48->isset)[i]);
sep = " ";
}
fprintf(stderr, "\n");
fprintf(stderr, "chunks and slots:\n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_48_IS_CHUNK_USED(n48, chunk))
continue;
fprintf(stderr, " chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
chunk,
*RT_NODE_48_GET_CHILD(n48, chunk));
}
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
char *sep = "";
fprintf(stderr, "isset-bitmap: ");
for (int i = 0; i < (RT_FANOUT_256 / BITS_PER_BYTE); i++)
{
fprintf(stderr, "%s%x", sep, ((uint8 *) n256->isset)[i]);
sep = " ";
}
fprintf(stderr, "\n");
fprintf(stderr, "chunks and slots:\n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_256_IS_CHUNK_USED(n256, chunk))
continue;
fprintf(stderr, " chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
chunk,
*RT_NODE_256_GET_CHILD(n256, chunk));
}
break;
}
}
}
#endif /* RT_DEBUG */
#endif /* RT_DEFINE */
/* undefine external parameters, so next radix tree can be defined */
#undef RT_PREFIX
#undef RT_SCOPE
#undef RT_DECLARE
#undef RT_DEFINE
#undef RT_VALUE_TYPE
#undef RT_VARLEN_VALUE_SIZE
#undef RT_RUNTIME_EMBEDDABLE_VALUE
#undef RT_SHMEM
#undef RT_USE_DELETE
#undef RT_DEBUG
/* locally declared macros */
#undef RT_MAKE_PREFIX
#undef RT_MAKE_NAME
#undef RT_MAKE_NAME_
#undef RT_STR
#undef RT_STR_
#undef RT_SPAN
#undef RT_NODE_MAX_SLOTS
#undef RT_CHUNK_MASK
#undef RT_MAX_SHIFT
#undef RT_MAX_LEVEL
#undef RT_GET_KEY_CHUNK
#undef RT_BM_IDX
#undef RT_BM_BIT
#undef RT_NODE_MUST_GROW
#undef RT_NODE_KIND_COUNT
#undef RT_NUM_SIZE_CLASSES
#undef RT_INVALID_SLOT_IDX
#undef RT_SLAB_BLOCK_SIZE
#undef RT_RADIX_TREE_MAGIC
#undef RT_CHILD_PTR_FORMAT
/* type declarations */
#undef RT_RADIX_TREE
#undef RT_RADIX_TREE_CONTROL
#undef RT_CHILD_PTR
#undef RT_PTR_ALLOC
#undef RT_INVALID_PTR_ALLOC
#undef RT_HANDLE
#undef RT_ITER
#undef RT_NODE
#undef RT_NODE_ITER
#undef RT_NODE_KIND_4
#undef RT_NODE_KIND_16
#undef RT_NODE_KIND_48
#undef RT_NODE_KIND_256
#undef RT_NODE_4
#undef RT_NODE_16
#undef RT_NODE_48
#undef RT_NODE_256
#undef RT_SIZE_CLASS
#undef RT_SIZE_CLASS_ELEM
#undef RT_SIZE_CLASS_INFO
#undef RT_CLASS_4
#undef RT_CLASS_16_LO
#undef RT_CLASS_16_HI
#undef RT_CLASS_48
#undef RT_CLASS_256
#undef RT_FANOUT_4
#undef RT_FANOUT_4_MAX
#undef RT_FANOUT_16_LO
#undef RT_FANOUT_16_HI
#undef RT_FANOUT_16_MAX
#undef RT_FANOUT_48
#undef RT_FANOUT_48_MAX
#undef RT_FANOUT_256
/* function declarations */
#undef RT_CREATE
#undef RT_FREE
#undef RT_ATTACH
#undef RT_DETACH
#undef RT_LOCK_EXCLUSIVE
#undef RT_LOCK_SHARE
#undef RT_UNLOCK
#undef RT_GET_HANDLE
#undef RT_FIND
#undef RT_SET
#undef RT_BEGIN_ITERATE
#undef RT_ITERATE_NEXT
#undef RT_END_ITERATE
#undef RT_DELETE
#undef RT_MEMORY_USAGE
#undef RT_DUMP_NODE
#undef RT_STATS
/* internal helper functions */
#undef RT_GET_VALUE_SIZE
#undef RT_VALUE_IS_EMBEDDABLE
#undef RT_CHILDPTR_IS_VALUE
#undef RT_GET_SLOT_RECURSIVE
#undef RT_DELETE_RECURSIVE
#undef RT_ALLOC_NODE
#undef RT_ALLOC_LEAF
#undef RT_FREE_NODE
#undef RT_FREE_LEAF
#undef RT_FREE_RECURSE
#undef RT_EXTEND_UP
#undef RT_EXTEND_DOWN
#undef RT_COPY_COMMON
#undef RT_PTR_SET_LOCAL
#undef RT_PTR_ALLOC_IS_VALID
#undef RT_NODE_16_SEARCH_EQ
#undef RT_NODE_4_GET_INSERTPOS
#undef RT_NODE_16_GET_INSERTPOS
#undef RT_SHIFT_ARRAYS_FOR_INSERT
#undef RT_SHIFT_ARRAYS_AND_DELETE
#undef RT_COPY_ARRAYS_FOR_INSERT
#undef RT_COPY_ARRAYS_AND_DELETE
#undef RT_NODE_48_IS_CHUNK_USED
#undef RT_NODE_48_GET_CHILD
#undef RT_NODE_256_IS_CHUNK_USED
#undef RT_NODE_256_GET_CHILD
#undef RT_KEY_GET_SHIFT
#undef RT_SHIFT_GET_MAX_VAL
#undef RT_NODE_SEARCH
#undef RT_ADD_CHILD_4
#undef RT_ADD_CHILD_16
#undef RT_ADD_CHILD_48
#undef RT_ADD_CHILD_256
#undef RT_GROW_NODE_4
#undef RT_GROW_NODE_16
#undef RT_GROW_NODE_48
#undef RT_REMOVE_CHILD_4
#undef RT_REMOVE_CHILD_16
#undef RT_REMOVE_CHILD_48
#undef RT_REMOVE_CHILD_256
#undef RT_SHRINK_NODE_16
#undef RT_SHRINK_NODE_48
#undef RT_SHRINK_NODE_256
#undef RT_NODE_DELETE
#undef RT_NODE_INSERT
#undef RT_NODE_ITERATE_NEXT
#undef RT_VERIFY_NODE