/*-------------------------------------------------------------------------
*
* radixtree.h
* Template for adaptive radix tree.
*
* A template to generate an "adaptive radix tree", specialized for value
* types and for local/shared memory.
*
* The concept originates from the paper "The Adaptive Radix Tree: ARTful
* Indexing for Main-Memory Databases" by Viktor Leis, Alfons Kemper,
* and Thomas Neumann, 2013.
*
* Radix trees have some advantages over hash tables:
* - The keys are logically ordered, allowing efficient sorted iteration
* and range queries
* - Operations using keys that are lexicographically close together
* will have favorable memory locality
* - Memory use grows gradually rather than by doubling
* - The key does not need to be stored with the value, since the key
* is implicitly contained in the path to the value
*
* Some disadvantages are:
* - Point queries (along with insertion and deletion) are slower than
* a linear probing hash table as in simplehash.h
* - Memory usage varies by key distribution, so is difficult to predict
*
* A classic radix tree consists of nodes, each containing an array of
* pointers to child nodes. The size of the array is determined by the
* "span" of the tree, which is the number of bits of the key used to
* index into the array. For example, with a span of 6, a "chunk"
* of 6 bits is extracted from the key at each node traversal, and
* the arrays thus have a "fanout" of 2^6 or 64 entries. A large span
* allows a shorter tree, but requires larger arrays that may be mostly
* wasted space.
*
* The key idea of the adaptive radix tree is to choose different
* data structures based on the number of child nodes. A node will
* start out small when it is first populated, and when it is full,
* it is replaced by the next larger size. Conversely, when a node
* becomes mostly empty, it is replaced by the next smaller node. The
* bulk of the code complexity in this module stems from this dynamic
* switching. One mitigating factor is using a span of 8, since bytes
* are directly addressable.
*
* The ART paper mentions three ways to implement leaves:
*
* "- Single-value leaves: The values are stored using an addi-
* tional leaf node type which stores one value.
* - Multi-value leaves: The values are stored in one of four
* different leaf node types, which mirror the structure of
* inner nodes, but contain values instead of pointers.
* - Combined pointer/value slots: If values fit into point-
* ers, no separate node types are necessary. Instead, each
* pointer storage location in an inner node can either
* store a pointer or a value."
*
* We use a form of "combined pointer/value slots", as recommended. Values
* whose size (if fixed at compile time) is equal to or smaller than the
* platform's pointer type are stored in the child slots of the last-level
* node, while larger values are stored as "single-value" leaves, as described
* above. This offers flexibility and efficiency. Variable-length types are currently
* treated as single-value leaves for simplicity, but future work may
* allow those to be stored in the child pointer arrays, when they're
* small enough.
*
* There are two other techniques described in the paper that are not
* implemented here:
* - path compression "...removes all inner nodes that have only a single child."
* - lazy path expansion "...inner nodes are only created if they are required
* to distinguish at least two leaf nodes."
*
* We do have a form of "poor man's path compression", however, enabled by
* only supporting unsigned integer keys (for now assumed to be 64-bit):
* A tree doesn't contain paths where the highest bytes of all keys are
* zero. That way, the tree's height adapts to the distribution of keys.
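* For example, if no key is larger than 0xFFFF, the tree has only two
* levels, even though keys are 64-bit integers.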
*
* To handle concurrency, we use a single reader-writer lock for the
* radix tree. If concurrent write operations are possible, the tree
* must be exclusively locked during write operations such as RT_SET()
* and RT_DELETE(), and share locked during read operations such as
* RT_FIND() and RT_BEGIN_ITERATE().
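*
* For example, a writer on a shared-memory tree would bracket its update
* roughly like this (a sketch; the "foo" prefix is hypothetical):
*
* foo_lock_exclusive(tree);
* foo_set(tree, key, &value);
* foo_unlock(tree);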
*
* TODO: The current locking mechanism is not optimized for high
* concurrency with mixed read-write workloads. In the future it might
* be worthwhile to replace it with the Optimistic Lock Coupling or
* ROWEX mentioned in the paper "The ART of Practical Synchronization"
* by the same authors as the ART paper, 2016.
*
* To generate a radix tree and associated functions for a use case,
* several macros have to be #define'ed before this file is included.
* Including the file #undef's all those, so a new radix tree can be
* generated afterwards.
*
* The relevant parameters are:
* - RT_PREFIX - prefix for all symbol names generated. A prefix of "foo"
* will result in radix tree type "foo_radix_tree" and functions like
* "foo_create"/"foo_free" and so forth.
* - RT_DECLARE - if defined function prototypes and type declarations are
* generated
* - RT_DEFINE - if defined function definitions are generated
* - RT_SCOPE - in which scope (e.g. extern, static inline) do function
* declarations reside
* - RT_VALUE_TYPE - the type of the value.
* - RT_VARLEN_VALUE_SIZE() - for variable length values, an expression
* involving a pointer to the value type, to calculate size.
* NOTE: implies that the value is in fact variable-length,
* so do not set for fixed-length values.
* - RT_RUNTIME_EMBEDDABLE_VALUE - for variable length values, allows
* storing the value in a child pointer slot, rather than as a single-
* value leaf, if small enough. This requires that the value, when
* read as a child pointer, can be tagged in the lowest bit.
*
* Optional parameters:
* - RT_SHMEM - if defined, the radix tree is created in the DSA area
* so that multiple processes can access it simultaneously.
* - RT_DEBUG - if defined add stats tracking and debugging functions
*
* Interface
* ---------
*
* RT_CREATE - Create a new, empty radix tree
* RT_FREE - Free the radix tree
* RT_FIND - Lookup the value for a given key
* RT_SET - Set a key-value pair
* RT_BEGIN_ITERATE - Begin iterating through all key-value pairs
* RT_ITERATE_NEXT - Return next key-value pair, if any
* RT_END_ITERATE - End iteration
* RT_MEMORY_USAGE - Get the memory usage, as measured by the space used in memory context blocks
*
* Interface for Shared Memory
* ---------
*
* RT_ATTACH - Attach to the radix tree
* RT_DETACH - Detach from the radix tree
* RT_LOCK_EXCLUSIVE - Lock the radix tree in exclusive mode
* RT_LOCK_SHARE - Lock the radix tree in share mode
* RT_UNLOCK - Unlock the radix tree
* RT_GET_HANDLE - Return the handle of the radix tree
*
* Optional Interface
* ---------
*
* RT_DELETE - Delete a key-value pair. Declared/defined if RT_USE_DELETE is defined
*
*
* Copyright (c) 2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/include/lib/radixtree.h
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "nodes/bitmapset.h"
#include "port/pg_bitutils.h"
#include "port/simd.h"
#include "utils/dsa.h"
#include "utils/memutils.h"
/* helpers */
#define RT_MAKE_PREFIX(a) CppConcat(a,_)
#define RT_MAKE_NAME(name) RT_MAKE_NAME_(RT_MAKE_PREFIX(RT_PREFIX),name)
#define RT_MAKE_NAME_(a,b) CppConcat(a,b)
/*
* stringify a macro constant, from https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
*/
#define RT_STR(s) RT_STR_(s)
#define RT_STR_(s) #s
/* function declarations */
#define RT_CREATE RT_MAKE_NAME(create)
#define RT_FREE RT_MAKE_NAME(free)
#define RT_FIND RT_MAKE_NAME(find)
#ifdef RT_SHMEM
#define RT_ATTACH RT_MAKE_NAME(attach)
#define RT_DETACH RT_MAKE_NAME(detach)
#define RT_GET_HANDLE RT_MAKE_NAME(get_handle)
#define RT_LOCK_EXCLUSIVE RT_MAKE_NAME(lock_exclusive)
#define RT_LOCK_SHARE RT_MAKE_NAME(lock_share)
#define RT_UNLOCK RT_MAKE_NAME(unlock)
#endif
#define RT_SET RT_MAKE_NAME(set)
#define RT_BEGIN_ITERATE RT_MAKE_NAME(begin_iterate)
#define RT_ITERATE_NEXT RT_MAKE_NAME(iterate_next)
#define RT_END_ITERATE RT_MAKE_NAME(end_iterate)
#ifdef RT_USE_DELETE
#define RT_DELETE RT_MAKE_NAME(delete)
#endif
#define RT_MEMORY_USAGE RT_MAKE_NAME(memory_usage)
#define RT_DUMP_NODE RT_MAKE_NAME(dump_node)
#define RT_STATS RT_MAKE_NAME(stats)
/* internal helper functions (no externally visible prototypes) */
#define RT_CHILDPTR_IS_VALUE RT_MAKE_NAME(childptr_is_value)
#define RT_VALUE_IS_EMBEDDABLE RT_MAKE_NAME(value_is_embeddable)
#define RT_GET_SLOT_RECURSIVE RT_MAKE_NAME(get_slot_recursive)
#define RT_DELETE_RECURSIVE RT_MAKE_NAME(delete_recursive)
#define RT_ALLOC_NODE RT_MAKE_NAME(alloc_node)
#define RT_ALLOC_LEAF RT_MAKE_NAME(alloc_leaf)
#define RT_FREE_NODE RT_MAKE_NAME(free_node)
#define RT_FREE_LEAF RT_MAKE_NAME(free_leaf)
#define RT_FREE_RECURSE RT_MAKE_NAME(free_recurse)
#define RT_EXTEND_UP RT_MAKE_NAME(extend_up)
#define RT_EXTEND_DOWN RT_MAKE_NAME(extend_down)
#define RT_COPY_COMMON RT_MAKE_NAME(copy_common)
#define RT_PTR_SET_LOCAL RT_MAKE_NAME(ptr_set_local)
#define RT_NODE_16_SEARCH_EQ RT_MAKE_NAME(node_16_search_eq)
#define RT_NODE_4_GET_INSERTPOS RT_MAKE_NAME(node_4_get_insertpos)
#define RT_NODE_16_GET_INSERTPOS RT_MAKE_NAME(node_16_get_insertpos)
#define RT_SHIFT_ARRAYS_FOR_INSERT RT_MAKE_NAME(shift_arrays_for_insert)
#define RT_SHIFT_ARRAYS_AND_DELETE RT_MAKE_NAME(shift_arrays_and_delete)
#define RT_COPY_ARRAYS_FOR_INSERT RT_MAKE_NAME(copy_arrays_for_insert)
#define RT_COPY_ARRAYS_AND_DELETE RT_MAKE_NAME(copy_arrays_and_delete)
#define RT_NODE_48_IS_CHUNK_USED RT_MAKE_NAME(node_48_is_chunk_used)
#define RT_NODE_48_GET_CHILD RT_MAKE_NAME(node_48_get_child)
#define RT_NODE_256_IS_CHUNK_USED RT_MAKE_NAME(node_256_is_chunk_used)
#define RT_NODE_256_GET_CHILD RT_MAKE_NAME(node_256_get_child)
#define RT_KEY_GET_SHIFT RT_MAKE_NAME(key_get_shift)
#define RT_SHIFT_GET_MAX_VAL RT_MAKE_NAME(shift_get_max_val)
#define RT_NODE_SEARCH RT_MAKE_NAME(node_search)
#define RT_NODE_DELETE RT_MAKE_NAME(node_delete)
#define RT_NODE_INSERT RT_MAKE_NAME(node_insert)
#define RT_ADD_CHILD_4 RT_MAKE_NAME(add_child_4)
#define RT_ADD_CHILD_16 RT_MAKE_NAME(add_child_16)
#define RT_ADD_CHILD_48 RT_MAKE_NAME(add_child_48)
#define RT_ADD_CHILD_256 RT_MAKE_NAME(add_child_256)
#define RT_GROW_NODE_4 RT_MAKE_NAME(grow_node_4)
#define RT_GROW_NODE_16 RT_MAKE_NAME(grow_node_16)
#define RT_GROW_NODE_48 RT_MAKE_NAME(grow_node_48)
#define RT_REMOVE_CHILD_4 RT_MAKE_NAME(remove_child_4)
#define RT_REMOVE_CHILD_16 RT_MAKE_NAME(remove_child_16)
#define RT_REMOVE_CHILD_48 RT_MAKE_NAME(remove_child_48)
#define RT_REMOVE_CHILD_256 RT_MAKE_NAME(remove_child_256)
#define RT_SHRINK_NODE_16 RT_MAKE_NAME(shrink_child_16)
#define RT_SHRINK_NODE_48 RT_MAKE_NAME(shrink_child_48)
#define RT_SHRINK_NODE_256 RT_MAKE_NAME(shrink_child_256)
#define RT_NODE_ITERATE_NEXT RT_MAKE_NAME(node_iterate_next)
#define RT_VERIFY_NODE RT_MAKE_NAME(verify_node)
/* type declarations */
#define RT_RADIX_TREE RT_MAKE_NAME(radix_tree)
#define RT_RADIX_TREE_CONTROL RT_MAKE_NAME(radix_tree_control)
#define RT_ITER RT_MAKE_NAME(iter)
#ifdef RT_SHMEM
#define RT_HANDLE RT_MAKE_NAME(handle)
#endif
#define RT_NODE RT_MAKE_NAME(node)
#define RT_CHILD_PTR RT_MAKE_NAME(child_ptr)
#define RT_NODE_ITER RT_MAKE_NAME(node_iter)
#define RT_NODE_4 RT_MAKE_NAME(node_4)
#define RT_NODE_16 RT_MAKE_NAME(node_16)
#define RT_NODE_48 RT_MAKE_NAME(node_48)
#define RT_NODE_256 RT_MAKE_NAME(node_256)
#define RT_SIZE_CLASS RT_MAKE_NAME(size_class)
#define RT_SIZE_CLASS_ELEM RT_MAKE_NAME(size_class_elem)
#define RT_SIZE_CLASS_INFO RT_MAKE_NAME(size_class_info)
#define RT_CLASS_4 RT_MAKE_NAME(class_4)
#define RT_CLASS_16_LO RT_MAKE_NAME(class_32_min)
#define RT_CLASS_16_HI RT_MAKE_NAME(class_32_max)
#define RT_CLASS_48 RT_MAKE_NAME(class_48)
#define RT_CLASS_256 RT_MAKE_NAME(class_256)
/* generate forward declarations necessary to use the radix tree */
#ifdef RT_DECLARE
typedef struct RT_RADIX_TREE RT_RADIX_TREE;
typedef struct RT_ITER RT_ITER;
#ifdef RT_SHMEM
typedef dsa_pointer RT_HANDLE;
#endif
#ifdef RT_SHMEM
RT_SCOPE RT_RADIX_TREE *RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id);
RT_SCOPE RT_RADIX_TREE *RT_ATTACH(dsa_area *dsa, dsa_pointer dp);
RT_SCOPE void RT_DETACH(RT_RADIX_TREE * tree);
RT_SCOPE RT_HANDLE RT_GET_HANDLE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_LOCK_EXCLUSIVE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_LOCK_SHARE(RT_RADIX_TREE * tree);
RT_SCOPE void RT_UNLOCK(RT_RADIX_TREE * tree);
#else
RT_SCOPE RT_RADIX_TREE *RT_CREATE(MemoryContext ctx);
#endif
RT_SCOPE void RT_FREE(RT_RADIX_TREE * tree);
RT_SCOPE RT_VALUE_TYPE *RT_FIND(RT_RADIX_TREE * tree, uint64 key);
RT_SCOPE bool RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p);
#ifdef RT_USE_DELETE
RT_SCOPE bool RT_DELETE(RT_RADIX_TREE * tree, uint64 key);
#endif
RT_SCOPE RT_ITER *RT_BEGIN_ITERATE(RT_RADIX_TREE * tree);
RT_SCOPE RT_VALUE_TYPE *RT_ITERATE_NEXT(RT_ITER * iter, uint64 *key_p);
RT_SCOPE void RT_END_ITERATE(RT_ITER * iter);
RT_SCOPE uint64 RT_MEMORY_USAGE(RT_RADIX_TREE * tree);
#ifdef RT_DEBUG
RT_SCOPE void RT_STATS(RT_RADIX_TREE * tree);
#endif
#endif /* RT_DECLARE */
/* generate implementation of the radix tree */
#ifdef RT_DEFINE
/* The number of bits encoded in one tree level */
#define RT_SPAN BITS_PER_BYTE
/*
* The number of possible partial keys, and thus the maximum number of
* child pointers, for a node.
*/
#define RT_NODE_MAX_SLOTS (1 << RT_SPAN)
/* Mask for extracting a chunk from a key */
#define RT_CHUNK_MASK ((1 << RT_SPAN) - 1)
/* Maximum shift needed to extract a chunk from a key */
#define RT_MAX_SHIFT RT_KEY_GET_SHIFT(UINT64_MAX)
/* Maximum level a tree can reach for a key */
#define RT_MAX_LEVEL ((sizeof(uint64) * BITS_PER_BYTE) / RT_SPAN)
/* Get a chunk from the key */
#define RT_GET_KEY_CHUNK(key, shift) ((uint8) (((key) >> (shift)) & RT_CHUNK_MASK))
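/*
 * For example, with RT_SPAN = 8, RT_GET_KEY_CHUNK(0x0A0B0C, 8) yields 0x0B:
 * the key is shifted right by one byte and masked down to a single byte.
 */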
/* For accessing bitmaps */
#define RT_BM_IDX(x) ((x) / BITS_PER_BITMAPWORD)
#define RT_BM_BIT(x) ((x) % BITS_PER_BITMAPWORD)
/*
* Node kinds
*
* The different node kinds are what make the tree "adaptive".
*
* Each node kind is associated with a different datatype and different
* search/set/delete/iterate algorithms adapted for its size. The largest
* kind, node256 is basically the same as a traditional radix tree,
* and would be most wasteful of memory when sparsely populated. The
* smaller nodes expend some additional CPU time to enable a smaller
* memory footprint.
*
* NOTE: There are 4 node kinds, and this should never be increased,
* for several reasons:
* 1. With 5 or more kinds, gcc tends to use a jump table for switch
* statements.
* 2. The 4 kinds can be represented with 2 bits, so we have the option
* in the future to tag the node pointer with the kind, even on
* platforms with 32-bit pointers. That would touch fewer cache lines
* during traversal and allow faster recovery from branch mispredicts.
* 3. We can have multiple size classes per node kind.
*/
#define RT_NODE_KIND_4 0x00
#define RT_NODE_KIND_16 0x01
#define RT_NODE_KIND_48 0x02
#define RT_NODE_KIND_256 0x03
#define RT_NODE_KIND_COUNT 4
/*
* Calculate the slab block size so that we can allocate at least 32 chunks
* from the block.
*/
#define RT_SLAB_BLOCK_SIZE(size) \
Max(SLAB_DEFAULT_BLOCK_SIZE, pg_nextpower2_32(size * 32))
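/*
 * For example (illustrative, assuming SLAB_DEFAULT_BLOCK_SIZE is 8 kB):
 * a 40-byte size class keeps the default 8 kB blocks, since 32 chunks only
 * need 1280 bytes, while a 512-byte size class gets 16 kB blocks so that a
 * block holds exactly 32 chunks.
 */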
/* Common header for all nodes */
typedef struct RT_NODE
{
/* Node kind, one per search/set algorithm */
uint8 kind;
/*
* Max capacity for the current size class. Storing this in the node
* enables multiple size classes per node kind. uint8 is sufficient for
* all node kinds, because we only use this number to test if the node
* needs to grow. Since node256 never needs to grow, we let this overflow
* to zero.
*/
uint8 fanout;
/*
* Number of children. uint8 is sufficient for all node kinds, because
* nodes shrink when this number gets lower than some threshold. Since
* node256 cannot possibly have zero children, we let the counter overflow
* and we interpret zero as "256" for this node kind.
*/
uint8 count;
} RT_NODE;
/* pointer returned by allocation */
#ifdef RT_SHMEM
#define RT_PTR_ALLOC dsa_pointer
#define RT_INVALID_PTR_ALLOC InvalidDsaPointer
#define RT_PTR_ALLOC_IS_VALID(ptr) DsaPointerIsValid(ptr)
#else
#define RT_PTR_ALLOC RT_NODE *
#define RT_INVALID_PTR_ALLOC NULL
#define RT_PTR_ALLOC_IS_VALID(ptr) PointerIsValid(ptr)
#endif
/*
* A convenience type used when we need to work with a DSA pointer as well
* as its local pointer. For local memory, both members are the same, so
* we use a union.
*/
#ifdef RT_SHMEM
typedef struct RT_CHILD_PTR
#else
typedef union RT_CHILD_PTR
#endif
{
RT_PTR_ALLOC alloc;
RT_NODE *local;
} RT_CHILD_PTR;
/*
* Helper macros and functions for value storage.
* We either embed values in the child slots of the last level
* node or store pointers to values to the child slots,
* depending on the value size.
*/
#ifdef RT_VARLEN_VALUE_SIZE
#define RT_GET_VALUE_SIZE(v) RT_VARLEN_VALUE_SIZE(v)
#else
#define RT_GET_VALUE_SIZE(v) sizeof(RT_VALUE_TYPE)
#endif
/*
* Return true if the value can be stored in the child array
* of the lowest-level node, false otherwise.
*/
static inline bool
RT_VALUE_IS_EMBEDDABLE(RT_VALUE_TYPE * value_p)
{
#ifdef RT_VARLEN_VALUE_SIZE
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
#else
return false;
#endif
#else
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
#endif
}
/*
* Return true if the child pointer contains the value, false
* if the child pointer is a leaf pointer.
*/
static inline bool
RT_CHILDPTR_IS_VALUE(RT_PTR_ALLOC child)
{
#ifdef RT_VARLEN_VALUE_SIZE
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
/* check for pointer tag */
#ifdef RT_SHMEM
return child & 1;
#else
return ((uintptr_t) child) & 1;
#endif
#else
return false;
#endif
#else
return sizeof(RT_VALUE_TYPE) <= sizeof(RT_PTR_ALLOC);
#endif
}
/*
* Symbols for the maximum possible fanout are declared first, as they are
* required to declare each node kind. The declarations of the other fanout
* values follow, since they need the struct sizes of each node kind.
*/
/* max possible key chunks without struct padding */
#define RT_FANOUT_4_MAX (8 - sizeof(RT_NODE))
/* equal to two 128-bit SIMD registers, regardless of availability */
#define RT_FANOUT_16_MAX 32
/*
* This also determines the number of bits necessary for the isset array,
* so we need to be mindful of the size of bitmapword. Since bitmapword
* can be 64 bits, the only values that make sense here are 64 and 128.
* The ART paper uses at most 64 for this node kind, and one advantage
* for us is that "isset" is a single bitmapword on most platforms,
* rather than an array, allowing the compiler to get rid of loops.
*/
#define RT_FANOUT_48_MAX 64
#define RT_FANOUT_256 RT_NODE_MAX_SLOTS
/*
* Node structs, one for each "kind"
*/
/*
* node4 and node16 use one array for key chunks and another
* array of the same length for children. The keys and children
* are stored at corresponding positions, sorted by chunk.
*/
typedef struct RT_NODE_4
{
RT_NODE base;
uint8 chunks[RT_FANOUT_4_MAX];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_4;
typedef struct RT_NODE_16
{
RT_NODE base;
uint8 chunks[RT_FANOUT_16_MAX];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_16;
/*
* node48 uses a 256-element array indexed by key chunks. This array
* stores indexes into a second array containing the children.
*/
typedef struct RT_NODE_48
{
RT_NODE base;
/* for each chunk, the index into the children array */
uint8 slot_idxs[RT_NODE_MAX_SLOTS];
/* Invalid index */
#define RT_INVALID_SLOT_IDX 0xFF
/* bitmap to track which slots are in use */
bitmapword isset[RT_BM_IDX(RT_FANOUT_48_MAX)];
/* number of children depends on size class */
RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER];
} RT_NODE_48;
/*
* node256 is the largest node type. This node has an array of
* children directly indexed by chunk. Unlike other node kinds,
* its array size is by definition fixed.
*/
typedef struct RT_NODE_256
{
RT_NODE base;
/* bitmap to track which slots are in use */
bitmapword isset[RT_BM_IDX(RT_FANOUT_256)];
/* slots for 256 children */
RT_PTR_ALLOC children[RT_FANOUT_256];
} RT_NODE_256;
#if defined(RT_SHMEM)
/*
* Make sure that all nodes (except for node256) fit neatly into a DSA
* size class. We assume that RT_FANOUT_4 falls in the range where DSA size
* classes increment by 8 bytes (as of PG17, up to 64 bytes), so we just
* hard-code that one.
*/
#if SIZEOF_DSA_POINTER < 8
#define RT_FANOUT_16_LO ((96 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_16_HI Min(RT_FANOUT_16_MAX, (160 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_48 Min(RT_FANOUT_48_MAX, (512 - offsetof(RT_NODE_48, children)) / sizeof(RT_PTR_ALLOC))
#else
#define RT_FANOUT_16_LO ((160 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_16_HI Min(RT_FANOUT_16_MAX, (320 - offsetof(RT_NODE_16, children)) / sizeof(RT_PTR_ALLOC))
#define RT_FANOUT_48 Min(RT_FANOUT_48_MAX, (768 - offsetof(RT_NODE_48, children)) / sizeof(RT_PTR_ALLOC))
#endif /* SIZEOF_DSA_POINTER < 8 */
#else /* ! RT_SHMEM */
/* doesn't really matter, but may as well use the namesake */
#define RT_FANOUT_16_LO 16
/* use maximum possible */
#define RT_FANOUT_16_HI RT_FANOUT_16_MAX
#define RT_FANOUT_48 RT_FANOUT_48_MAX
#endif /* RT_SHMEM */
/*
* To save memory in trees with sparse keys, it would make sense to have two
* size classes for the smallest kind (perhaps a high class of 5 and a low class
* of 2), but it would be more effective to utilize lazy expansion and
* path compression.
*/
#define RT_FANOUT_4 4
StaticAssertDecl(RT_FANOUT_4 <= RT_FANOUT_4_MAX, "watch struct padding");
StaticAssertDecl(RT_FANOUT_16_LO < RT_FANOUT_16_HI, "LO subclass bigger than HI");
StaticAssertDecl(RT_FANOUT_48 <= RT_FANOUT_48_MAX, "more slots than isset bits");
/*
* Node size classes
*
* Nodes of different kinds necessarily belong to different size classes.
* One innovation in our implementation compared to the ART paper is
* decoupling the notion of size class from kind.
*
* The size classes within a given node kind have the same underlying
* type, but a variable number of children/values. This is possible
* because each type (except node256) contains metadata that work the
* same way regardless of how many child slots there are. The nodes
* can introspect their allocated capacity at runtime.
*/
typedef enum RT_SIZE_CLASS
{
RT_CLASS_4 = 0,
RT_CLASS_16_LO,
RT_CLASS_16_HI,
RT_CLASS_48,
RT_CLASS_256
} RT_SIZE_CLASS;
/* Information for each size class */
typedef struct RT_SIZE_CLASS_ELEM
{
const char *name;
int fanout;
size_t allocsize;
} RT_SIZE_CLASS_ELEM;
static const RT_SIZE_CLASS_ELEM RT_SIZE_CLASS_INFO[] = {
[RT_CLASS_4] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node4",
.fanout = RT_FANOUT_4,
.allocsize = sizeof(RT_NODE_4) + RT_FANOUT_4 * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_16_LO] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node16_lo",
.fanout = RT_FANOUT_16_LO,
.allocsize = sizeof(RT_NODE_16) + RT_FANOUT_16_LO * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_16_HI] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node16_hi",
.fanout = RT_FANOUT_16_HI,
.allocsize = sizeof(RT_NODE_16) + RT_FANOUT_16_HI * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_48] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node48",
.fanout = RT_FANOUT_48,
.allocsize = sizeof(RT_NODE_48) + RT_FANOUT_48 * sizeof(RT_PTR_ALLOC),
},
[RT_CLASS_256] = {
.name = RT_STR(RT_PREFIX) "_radix_tree node256",
.fanout = RT_FANOUT_256,
.allocsize = sizeof(RT_NODE_256),
},
};
#define RT_NUM_SIZE_CLASSES lengthof(RT_SIZE_CLASS_INFO)
#ifdef RT_SHMEM
/* A magic value used to identify our radix tree */
#define RT_RADIX_TREE_MAGIC 0x54A48167
#endif
/* Contains the actual tree, plus ancillary info */
typedef struct RT_RADIX_TREE_CONTROL
{
#ifdef RT_SHMEM
RT_HANDLE handle;
uint32 magic;
LWLock lock;
#endif
RT_PTR_ALLOC root;
uint64 max_val;
int64 num_keys;
int start_shift;
/* statistics */
#ifdef RT_DEBUG
int64 num_nodes[RT_NUM_SIZE_CLASSES];
int64 num_leaves;
#endif
} RT_RADIX_TREE_CONTROL;
/* Entry point for allocating and accessing the tree */
struct RT_RADIX_TREE
{
MemoryContext context;
/* pointing to either local memory or DSA */
RT_RADIX_TREE_CONTROL *ctl;
#ifdef RT_SHMEM
dsa_area *dsa;
#else
MemoryContextData *node_slabs[RT_NUM_SIZE_CLASSES];
/* leaf_context is used only for single-value leaves */
MemoryContextData *leaf_context;
#endif
MemoryContextData *iter_context;
};
/*
* Iteration support.
*
* Iterating over the radix tree produces each key/value pair in ascending
* order of the key.
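*
* A typical caller loop looks roughly like this (a sketch; the "foo"
* prefix is hypothetical and the value type is whatever RT_VALUE_TYPE
* was instantiated with):
*
* uint64 key;
* RT_VALUE_TYPE *value;
* foo_iter *iter = foo_begin_iterate(tree);
* while ((value = foo_iterate_next(iter, &key)) != NULL)
* {
* ... use key and value ...
* }
* foo_end_iterate(iter);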
*/
/* state for iterating over a single node */
typedef struct RT_NODE_ITER
{
RT_CHILD_PTR node;
/*
* The next index of the chunk array in RT_NODE_KIND_4 and RT_NODE_KIND_16
* nodes, or the next chunk in RT_NODE_KIND_48 and RT_NODE_KIND_256 nodes.
* 0 for the initial value.
*/
int idx;
} RT_NODE_ITER;
/* state for iterating over the whole radix tree */
struct RT_ITER
{
RT_RADIX_TREE *tree;
/*
* A stack to track iteration for each level. Level 0 is the lowest (or
* leaf) level
*/
RT_NODE_ITER node_iters[RT_MAX_LEVEL];
int top_level;
int cur_level;
/* The key constructed during iteration */
uint64 key;
};
/* verification (available only in assert-enabled builds) */
static void RT_VERIFY_NODE(RT_NODE * node);
static inline void
RT_PTR_SET_LOCAL(RT_RADIX_TREE * tree, RT_CHILD_PTR * node)
{
#ifdef RT_SHMEM
node->local = dsa_get_address(tree->dsa, node->alloc);
#endif
}
/* Convenience functions for node48 and node256 */
/* Return true if there is an entry for "chunk" */
static inline bool
RT_NODE_48_IS_CHUNK_USED(RT_NODE_48 * node, uint8 chunk)
{
return node->slot_idxs[chunk] != RT_INVALID_SLOT_IDX;
}
static inline RT_PTR_ALLOC *
RT_NODE_48_GET_CHILD(RT_NODE_48 * node, uint8 chunk)
{
return &node->children[node->slot_idxs[chunk]];
}
/* Return true if there is an entry for "chunk" */
static inline bool
RT_NODE_256_IS_CHUNK_USED(RT_NODE_256 * node, uint8 chunk)
{
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
return (node->isset[idx] & ((bitmapword) 1 << bitnum)) != 0;
}
static inline RT_PTR_ALLOC *
RT_NODE_256_GET_CHILD(RT_NODE_256 * node, uint8 chunk)
{
Assert(RT_NODE_256_IS_CHUNK_USED(node, chunk));
return &node->children[chunk];
}
/*
* Return the smallest shift that will allow storing the given key.
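* For example, keys up to 0xFF need a shift of 0, keys up to 0xFFFF a shift
* of 8, and UINT64_MAX needs RT_MAX_SHIFT, i.e. 56.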
*/
static inline int
RT_KEY_GET_SHIFT(uint64 key)
{
if (key == 0)
return 0;
else
return (pg_leftmost_one_pos64(key) / RT_SPAN) * RT_SPAN;
}
/*
* Return the max value that can be stored in the tree with the given shift.
*/
static uint64
RT_SHIFT_GET_MAX_VAL(int shift)
{
if (shift == RT_MAX_SHIFT)
return UINT64_MAX;
else
return (UINT64CONST(1) << (shift + RT_SPAN)) - 1;
}
/*
* Allocate a new node with the given node kind and size class.
*/
static inline RT_CHILD_PTR
RT_ALLOC_NODE(RT_RADIX_TREE * tree, const uint8 kind, const RT_SIZE_CLASS size_class)
{
RT_CHILD_PTR allocnode;
RT_NODE *node;
size_t allocsize;
allocsize = RT_SIZE_CLASS_INFO[size_class].allocsize;
#ifdef RT_SHMEM
allocnode.alloc = dsa_allocate(tree->dsa, allocsize);
#else
allocnode.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->node_slabs[size_class],
allocsize);
#endif
RT_PTR_SET_LOCAL(tree, &allocnode);
node = allocnode.local;
/* initialize contents */
memset(node, 0, sizeof(RT_NODE));
switch (kind)
{
case RT_NODE_KIND_4:
case RT_NODE_KIND_16:
break;
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
memset(n48->isset, 0, sizeof(n48->isset));
memset(n48->slot_idxs, RT_INVALID_SLOT_IDX, sizeof(n48->slot_idxs));
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
memset(n256->isset, 0, sizeof(n256->isset));
break;
}
default:
pg_unreachable();
}
node->kind = kind;
/*
* For node256, this will actually overflow to zero, but that's okay
* because that node doesn't need to introspect this value.
*/
node->fanout = RT_SIZE_CLASS_INFO[size_class].fanout;
#ifdef RT_DEBUG
/* update the statistics */
tree->ctl->num_nodes[size_class]++;
#endif
return allocnode;
}
/*
* Allocate a new leaf.
*/
static RT_CHILD_PTR
RT_ALLOC_LEAF(RT_RADIX_TREE * tree, size_t allocsize)
{
RT_CHILD_PTR leaf;
#ifdef RT_SHMEM
leaf.alloc = dsa_allocate(tree->dsa, allocsize);
RT_PTR_SET_LOCAL(tree, &leaf);
#else
leaf.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->leaf_context, allocsize);
#endif
#ifdef RT_DEBUG
tree->ctl->num_leaves++;
#endif
return leaf;
}
/*
* Copy relevant members of the node header.
* This is a separate function in case other fields are added.
*/
static inline void
RT_COPY_COMMON(RT_CHILD_PTR newnode, RT_CHILD_PTR oldnode)
{
(newnode.local)->count = (oldnode.local)->count;
}
/* Free the given node */
static void
RT_FREE_NODE(RT_RADIX_TREE * tree, RT_CHILD_PTR node)
{
#ifdef RT_DEBUG
int i;
/* update the statistics */
for (i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
if ((node.local)->fanout == RT_SIZE_CLASS_INFO[i].fanout)
break;
}
/*
* The fanout of node256 will appear to be zero within the node header
* because of overflow, so we need an extra check here.
*/
if (i == RT_NUM_SIZE_CLASSES)
i = RT_CLASS_256;
tree->ctl->num_nodes[i]--;
Assert(tree->ctl->num_nodes[i] >= 0);
#endif
#ifdef RT_SHMEM
dsa_free(tree->dsa, node.alloc);
#else
pfree(node.alloc);
#endif
}
static inline void
RT_FREE_LEAF(RT_RADIX_TREE * tree, RT_PTR_ALLOC leaf)
{
Assert(leaf != tree->ctl->root);
#ifdef RT_DEBUG
/* update the statistics */
tree->ctl->num_leaves--;
Assert(tree->ctl->num_leaves >= 0);
#endif
#ifdef RT_SHMEM
dsa_free(tree->dsa, leaf);
#else
pfree(leaf);
#endif
}
/***************** SEARCH *****************/
/*
* Return the address of the child corresponding to "chunk",
* or NULL if there is no such element.
*/
static inline RT_PTR_ALLOC *
RT_NODE_16_SEARCH_EQ(RT_NODE_16 * node, uint8 chunk)
{
int count = node->base.count;
#ifndef USE_NO_SIMD
Vector8 spread_chunk;
Vector8 haystack1;
Vector8 haystack2;
Vector8 cmp1;
Vector8 cmp2;
uint32 bitfield;
RT_PTR_ALLOC *slot_simd = NULL;
#endif
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
RT_PTR_ALLOC *slot = NULL;
for (int i = 0; i < count; i++)
{
if (node->chunks[i] == chunk)
{
slot = &node->children[i];
break;
}
}
#endif
#ifndef USE_NO_SIMD
/* replicate the search key */
spread_chunk = vector8_broadcast(chunk);
/* compare to all 32 keys stored in the node */
vector8_load(&haystack1, &node->chunks[0]);
vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]);
cmp1 = vector8_eq(spread_chunk, haystack1);
cmp2 = vector8_eq(spread_chunk, haystack2);
/* convert comparison to a bitfield */
bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8));
/* mask off invalid entries */
bitfield &= ((UINT64CONST(1) << count) - 1);
/* convert bitfield to index by counting trailing zeros */
if (bitfield)
slot_simd = &node->children[pg_rightmost_one_pos32(bitfield)];
Assert(slot_simd == slot);
return slot_simd;
#else
return slot;
#endif
}
/*
* Search for the child pointer corresponding to "chunk" in the given node.
*
* Return the child slot if the chunk is found, otherwise return NULL.
*/
static inline RT_PTR_ALLOC *
RT_NODE_SEARCH(RT_NODE * node, uint8 chunk)
{
/* Make sure we already converted to local pointer */
Assert(node != NULL);
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
for (int i = 0; i < n4->base.count; i++)
{
if (n4->chunks[i] == chunk)
return &n4->children[i];
}
return NULL;
}
case RT_NODE_KIND_16:
return RT_NODE_16_SEARCH_EQ((RT_NODE_16 *) node, chunk);
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
int slotpos = n48->slot_idxs[chunk];
if (slotpos == RT_INVALID_SLOT_IDX)
return NULL;
return RT_NODE_48_GET_CHILD(n48, chunk);
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
if (!RT_NODE_256_IS_CHUNK_USED(n256, chunk))
return NULL;
return RT_NODE_256_GET_CHILD(n256, chunk);
}
default:
pg_unreachable();
}
}
/*
* Search for the given key in the radix tree. Return a pointer to the value
* if found, otherwise return NULL.
*
* Since the function returns a pointer (to support variable-length values),
* the caller is responsible for locking until it's finished with the value.
*/
RT_SCOPE RT_VALUE_TYPE *
RT_FIND(RT_RADIX_TREE * tree, uint64 key)
{
RT_CHILD_PTR node;
RT_PTR_ALLOC *slot = NULL;
int shift;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
if (key > tree->ctl->max_val)
return NULL;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
node.alloc = tree->ctl->root;
shift = tree->ctl->start_shift;
/* Descend the tree */
while (shift >= 0)
{
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, RT_GET_KEY_CHUNK(key, shift));
if (slot == NULL)
return NULL;
node.alloc = *slot;
shift -= RT_SPAN;
}
if (RT_CHILDPTR_IS_VALUE(*slot))
return (RT_VALUE_TYPE *) slot;
else
{
RT_PTR_SET_LOCAL(tree, &node);
return (RT_VALUE_TYPE *) node.local;
}
}
/***************** INSERTION *****************/
#define RT_NODE_MUST_GROW(node) \
((node)->count == (node)->fanout)
/*
* Return index of the chunk and slot arrays for inserting into the node,
* such that the arrays remain ordered.
*/
static inline int
RT_NODE_4_GET_INSERTPOS(RT_NODE_4 * node, uint8 chunk, int count)
{
int idx;
for (idx = 0; idx < count; idx++)
{
if (node->chunks[idx] >= chunk)
break;
}
return idx;
}
/*
* Return index of the chunk and slot arrays for inserting into the node,
* such that the arrays remain ordered.
*/
static inline int
RT_NODE_16_GET_INSERTPOS(RT_NODE_16 * node, uint8 chunk)
{
int count = node->base.count;
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
int index;
#endif
#ifndef USE_NO_SIMD
Vector8 spread_chunk;
Vector8 haystack1;
Vector8 haystack2;
Vector8 cmp1;
Vector8 cmp2;
Vector8 min1;
Vector8 min2;
uint32 bitfield;
int index_simd;
#endif
/*
* First compare the last element. There are two reasons to branch here:
*
* 1) A realistic pattern is inserting ordered keys. In that case,
* non-SIMD platforms must do a linear search to the last chunk to find
* the insert position. This will get slower as the node fills up.
*
* 2) On SIMD platforms, we must branch anyway to make sure we don't bit
* scan an empty bitfield. Doing the branch here eliminates some work that
* we might otherwise throw away.
*/
Assert(count > 0);
if (node->chunks[count - 1] < chunk)
return count;
#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING)
for (index = 0; index < count; index++)
{
if (node->chunks[index] > chunk)
break;
}
#endif
#ifndef USE_NO_SIMD
/*
* This is a bit more complicated than RT_NODE_16_SEARCH_EQ(), because no
* unsigned uint8 comparison instruction exists, at least for SSE2. So we
* need to play some trickery using vector8_min() to effectively get >=.
* There'll never be any equal elements in current uses, but that's what
* we get here...
*/
spread_chunk = vector8_broadcast(chunk);
vector8_load(&haystack1, &node->chunks[0]);
vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]);
min1 = vector8_min(spread_chunk, haystack1);
min2 = vector8_min(spread_chunk, haystack2);
cmp1 = vector8_eq(spread_chunk, min1);
cmp2 = vector8_eq(spread_chunk, min2);
bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8));
Assert((bitfield & ((UINT64CONST(1) << count) - 1)) != 0);
index_simd = pg_rightmost_one_pos32(bitfield);
Assert(index_simd == index);
return index_simd;
#else
return index;
#endif
}
/* Shift the elements right at "insertpos" by one */
static inline void
RT_SHIFT_ARRAYS_FOR_INSERT(uint8 *chunks, RT_PTR_ALLOC * children, int count, int insertpos)
{
/*
* This is basically a memmove, but written in a simple loop for speed on
* small inputs.
*/
for (int i = count - 1; i >= insertpos; i--)
{
/* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */
#ifdef __GNUC__
__asm__("");
#endif
chunks[i + 1] = chunks[i];
children[i + 1] = children[i];
}
}
/*
* Copy both chunk and slot arrays into the right
* place. The caller is responsible for inserting the new element.
*/
static inline void
RT_COPY_ARRAYS_FOR_INSERT(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children,
uint8 *src_chunks, RT_PTR_ALLOC * src_children,
int count, int insertpos)
{
for (int i = 0; i < count; i++)
{
int sourceidx = i;
/* use a branch-free computation to skip the index of the new element */
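/*
 * For example, with insertpos = 1 and count = 3, source indexes 0,1,2 land
 * at destination indexes 0,2,3, leaving destination index 1 free for the
 * new element.
 */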
int destidx = i + (i >= insertpos);
dst_chunks[destidx] = src_chunks[sourceidx];
dst_children[destidx] = src_children[sourceidx];
}
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_256(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
/* Mark the slot used for "chunk" */
n256->isset[idx] |= ((bitmapword) 1 << bitnum);
n256->base.count++;
RT_VERIFY_NODE((RT_NODE *) n256);
return RT_NODE_256_GET_CHILD(n256, chunk);
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
RT_CHILD_PTR newnode;
RT_NODE_256 *new256;
int i = 0;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_256, RT_CLASS_256);
new256 = (RT_NODE_256 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int word_num = 0; word_num < RT_BM_IDX(RT_NODE_MAX_SLOTS); word_num++)
{
bitmapword bitmap = 0;
/*
* Bit manipulation is a surprisingly large portion of the overhead in
* the naive implementation. Doing stores word-at-a-time removes a lot
* of that overhead.
*/
for (int bit = 0; bit < BITS_PER_BITMAPWORD; bit++)
{
uint8 offset = n48->slot_idxs[i];
if (offset != RT_INVALID_SLOT_IDX)
{
bitmap |= ((bitmapword) 1 << bit);
new256->children[i] = n48->children[offset];
}
i++;
}
new256->isset[word_num] = bitmap;
}
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return RT_ADD_CHILD_256(tree, newnode, chunk);
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_48(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
int insertpos;
int idx = 0;
bitmapword w,
inverse;
/* get the first word with at least one bit not set */
for (int i = 0; i < RT_BM_IDX(RT_FANOUT_48_MAX); i++)
{
w = n48->isset[i];
if (w < ~((bitmapword) 0))
{
idx = i;
break;
}
}
/* To get the first unset bit in w, get the first set bit in ~w */
inverse = ~w;
insertpos = idx * BITS_PER_BITMAPWORD;
insertpos += bmw_rightmost_one_pos(inverse);
Assert(insertpos < n48->base.fanout);
/* mark the slot used by setting the rightmost zero bit */
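/*
 * For example, if w is 0b0111 (slots 0-2 in use), then w + 1 is 0b1000,
 * which sets exactly the bit for the first free slot.
 */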
n48->isset[idx] |= w + 1;
/* insert new chunk into place */
n48->slot_idxs[chunk] = insertpos;
n48->base.count++;
RT_VERIFY_NODE((RT_NODE *) n48);
return &n48->children[insertpos];
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int insertpos;
if (n16->base.fanout < RT_FANOUT_16_HI)
{
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
Assert(n16->base.fanout == RT_FANOUT_16_LO);
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_HI);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over existing entries */
RT_COPY_COMMON(newnode, node);
Assert(n16->base.count == RT_FANOUT_16_LO);
insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk);
RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children,
n16->chunks, n16->children,
RT_FANOUT_16_LO, insertpos);
/* insert new chunk into place */
new16->chunks[insertpos] = chunk;
new16->base.count++;
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update references */
RT_FREE_NODE(tree, node);
*parent_slot = newnode.alloc;
return &new16->children[insertpos];
}
else
{
RT_CHILD_PTR newnode;
RT_NODE_48 *new48;
int idx,
bit;
Assert(n16->base.fanout == RT_FANOUT_16_HI);
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48);
new48 = (RT_NODE_48 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int i = 0; i < RT_FANOUT_16_HI; i++)
new48->slot_idxs[n16->chunks[i]] = i;
memcpy(&new48->children[0], &n16->children[0], RT_FANOUT_16_HI * sizeof(new48->children[0]));
/*
* Since we just copied a dense array, we can fill "isset" using a
* single store, provided the length of that array is at most the
* number of bits in a bitmapword.
*/
Assert(RT_FANOUT_16_HI <= BITS_PER_BITMAPWORD);
new48->isset[0] = (bitmapword) (((uint64) 1 << RT_FANOUT_16_HI) - 1);
/* put the new child at the end of the copied entries */
insertpos = RT_FANOUT_16_HI;
idx = RT_BM_IDX(insertpos);
bit = RT_BM_BIT(insertpos);
/* mark the slot used */
new48->isset[idx] |= ((bitmapword) 1 << bit);
/* insert new chunk into place */
new48->slot_idxs[chunk] = insertpos;
new48->base.count++;
RT_VERIFY_NODE((RT_NODE *) new48);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return &new48->children[insertpos];
}
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_16(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk);
/* shift chunks and children */
RT_SHIFT_ARRAYS_FOR_INSERT(n16->chunks, n16->children,
n16->base.count, insertpos);
/* insert new chunk into place */
n16->chunks[insertpos] = chunk;
n16->base.count++;
RT_VERIFY_NODE((RT_NODE *) n16);
return &n16->children[insertpos];
}
static pg_noinline RT_PTR_ALLOC *
RT_GROW_NODE_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
int insertpos;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over existing entries */
RT_COPY_COMMON(newnode, node);
Assert(n4->base.count == RT_FANOUT_4);
insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, RT_FANOUT_4);
RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children,
n4->chunks, n4->children,
RT_FANOUT_4, insertpos);
/* insert new chunk into place */
new16->chunks[insertpos] = chunk;
new16->base.count++;
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
return &new16->children[insertpos];
}
static inline RT_PTR_ALLOC *
RT_ADD_CHILD_4(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
int count = n4->base.count;
int insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, count);
/* shift chunks and children */
RT_SHIFT_ARRAYS_FOR_INSERT(n4->chunks, n4->children,
count, insertpos);
/* insert new chunk into place */
n4->chunks[insertpos] = chunk;
n4->base.count++;
RT_VERIFY_NODE((RT_NODE *) n4);
return &n4->children[insertpos];
}
/*
* Reserve a slot in "node"'s child array. The caller will populate it
* with the actual child pointer.
*
* "parent_slot" is the address of the parent's child pointer to "node".
* If the node we're inserting into needs to grow, we update the parent's
* child pointer with the pointer to the new larger node.
*/
static inline RT_PTR_ALLOC *
RT_NODE_INSERT(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node,
uint8 chunk)
{
RT_NODE *n = node.local;
switch (n->kind)
{
case RT_NODE_KIND_4:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_4(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_4(tree, node, chunk);
}
case RT_NODE_KIND_16:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_16(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_16(tree, node, chunk);
}
case RT_NODE_KIND_48:
{
if (unlikely(RT_NODE_MUST_GROW(n)))
return RT_GROW_NODE_48(tree, parent_slot, node, chunk);
return RT_ADD_CHILD_48(tree, node, chunk);
}
case RT_NODE_KIND_256:
return RT_ADD_CHILD_256(tree, node, chunk);
default:
pg_unreachable();
}
}
/*
* The radix tree doesn't have sufficient height. Put new node(s) on top,
* and move the old node below it.
*/
static pg_noinline void
RT_EXTEND_UP(RT_RADIX_TREE * tree, uint64 key)
{
int target_shift = RT_KEY_GET_SHIFT(key);
int shift = tree->ctl->start_shift;
Assert(shift < target_shift);
/* Grow tree upwards until start shift can accommodate the key */
while (shift < target_shift)
{
RT_CHILD_PTR node;
RT_NODE_4 *n4;
node = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = 0;
n4->children[0] = tree->ctl->root;
/* Update the root */
tree->ctl->root = node.alloc;
shift += RT_SPAN;
}
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(target_shift);
tree->ctl->start_shift = target_shift;
}
/*
* Insert a chain of nodes until we reach the lowest level,
* and return the address of a slot to be filled further up
* the call stack.
*/
static pg_noinline RT_PTR_ALLOC *
RT_EXTEND_DOWN(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift)
{
RT_CHILD_PTR node,
child;
RT_NODE_4 *n4;
/*
* The child pointer of the first node in the chain goes in the
* caller-provided slot.
*/
child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
*parent_slot = child.alloc;
node = child;
shift -= RT_SPAN;
while (shift > 0)
{
child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
/* We open-code the insertion ourselves, for speed. */
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, shift);
n4->children[0] = child.alloc;
node = child;
shift -= RT_SPAN;
}
Assert(shift == 0);
/* Reserve slot for the value. */
n4 = (RT_NODE_4 *) node.local;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, 0);
n4->base.count = 1;
return &n4->children[0];
}
/*
* Workhorse for RT_SET
*
* "parent_slot" is the address of the child pointer we just followed,
* in the parent's array of children, needed if inserting into the
* current node causes it to grow.
*/
static RT_PTR_ALLOC *
RT_GET_SLOT_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift, bool *found)
{
RT_PTR_ALLOC *slot;
RT_CHILD_PTR node;
uint8 chunk = RT_GET_KEY_CHUNK(key, shift);
node.alloc = *parent_slot;
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, chunk);
if (slot == NULL)
{
*found = false;
/* reserve slot for the caller to populate */
slot = RT_NODE_INSERT(tree, parent_slot, node, chunk);
if (shift == 0)
return slot;
else
return RT_EXTEND_DOWN(tree, slot, key, shift);
}
else
{
if (shift == 0)
{
*found = true;
return slot;
}
else
return RT_GET_SLOT_RECURSIVE(tree, slot, key, shift - RT_SPAN, found);
}
}
/*
* Set the given key to the value that value_p points to. If the entry already
* exists, update its value and return true; otherwise insert it and return false.
*
* Taking a lock in exclusive mode is the caller's responsibility.
*/
RT_SCOPE bool
RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p)
{
bool found;
RT_PTR_ALLOC *slot;
size_t value_sz = RT_GET_VALUE_SIZE(value_p);
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
/* Extend the tree if necessary */
if (unlikely(key > tree->ctl->max_val))
{
if (tree->ctl->num_keys == 0)
{
RT_CHILD_PTR node;
RT_NODE_4 *n4;
int start_shift = RT_KEY_GET_SHIFT(key);
/*
* With an empty root node, we don't extend the tree upwards,
* since that would result in orphan empty nodes. Instead we open
* code inserting into the root node and extend downward from
* there.
*/
node.alloc = tree->ctl->root;
RT_PTR_SET_LOCAL(tree, &node);
n4 = (RT_NODE_4 *) node.local;
n4->base.count = 1;
n4->chunks[0] = RT_GET_KEY_CHUNK(key, start_shift);
slot = RT_EXTEND_DOWN(tree, &n4->children[0], key, start_shift);
found = false;
tree->ctl->start_shift = start_shift;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(start_shift);
goto have_slot;
}
else
RT_EXTEND_UP(tree, key);
}
slot = RT_GET_SLOT_RECURSIVE(tree, &tree->ctl->root,
key, tree->ctl->start_shift, &found);
have_slot:
Assert(slot != NULL);
if (RT_VALUE_IS_EMBEDDABLE(value_p))
{
/* free the existing leaf */
if (found && !RT_CHILDPTR_IS_VALUE(*slot))
RT_FREE_LEAF(tree, *slot);
/* store value directly in child pointer slot */
memcpy(slot, value_p, value_sz);
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
/* tag child pointer */
#ifdef RT_SHMEM
*slot |= 1;
#else
*((uintptr_t *) slot) |= 1;
#endif
#endif
}
else
{
RT_CHILD_PTR leaf;
if (found && !RT_CHILDPTR_IS_VALUE(*slot))
{
Assert(RT_PTR_ALLOC_IS_VALID(*slot));
leaf.alloc = *slot;
RT_PTR_SET_LOCAL(tree, &leaf);
if (RT_GET_VALUE_SIZE((RT_VALUE_TYPE *) leaf.local) != value_sz)
{
/*
* different sizes, so first free the existing leaf before
* allocating a new one
*/
RT_FREE_LEAF(tree, *slot);
leaf = RT_ALLOC_LEAF(tree, value_sz);
*slot = leaf.alloc;
}
}
else
{
/* allocate new leaf and store it in the child array */
leaf = RT_ALLOC_LEAF(tree, value_sz);
*slot = leaf.alloc;
}
memcpy(leaf.local, value_p, value_sz);
}
/* Update the statistics */
if (!found)
tree->ctl->num_keys++;
return found;
}
/***************** SETUP / TEARDOWN *****************/
/*
* Create the radix tree in the given memory context and return it.
*
* All local memory required for a radix tree is allocated in the given
* memory context and its children. Note that RT_FREE() will delete all
* allocated space within the given memory context, so the dsa_area should
* be created in a different context.
*/
RT_SCOPE RT_RADIX_TREE *
#ifdef RT_SHMEM
RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id)
#else
RT_CREATE(MemoryContext ctx)
#endif
{
RT_RADIX_TREE *tree;
MemoryContext old_ctx;
RT_CHILD_PTR rootnode;
#ifdef RT_SHMEM
dsa_pointer dp;
#endif
old_ctx = MemoryContextSwitchTo(ctx);
tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE));
tree->context = ctx;
/*
* Separate context for iteration in case the tree context doesn't support
* pfree
*/
tree->iter_context = AllocSetContextCreate(ctx,
RT_STR(RT_PREFIX) "_radix_tree iter context",
ALLOCSET_SMALL_SIZES);
#ifdef RT_SHMEM
tree->dsa = dsa;
dp = dsa_allocate0(dsa, sizeof(RT_RADIX_TREE_CONTROL));
tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, dp);
tree->ctl->handle = dp;
tree->ctl->magic = RT_RADIX_TREE_MAGIC;
LWLockInitialize(&tree->ctl->lock, tranche_id);
#else
tree->ctl = (RT_RADIX_TREE_CONTROL *) palloc0(sizeof(RT_RADIX_TREE_CONTROL));
/* Create a slab context for each size class */
for (int i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i];
size_t inner_blocksize = RT_SLAB_BLOCK_SIZE(size_class.allocsize);
tree->node_slabs[i] = SlabContextCreate(ctx,
size_class.name,
inner_blocksize,
size_class.allocsize);
}
/* By default we use the passed context for leaves. */
tree->leaf_context = tree->context;
#ifndef RT_VARLEN_VALUE_SIZE
/*
* For leaves storing fixed-length values, we use a slab context to avoid
* the possibility of space wastage by power-of-2 rounding up.
*/
if (sizeof(RT_VALUE_TYPE) > sizeof(RT_PTR_ALLOC))
tree->leaf_context = SlabContextCreate(ctx,
RT_STR(RT_PREFIX) "_radix_tree leaf context",
RT_SLAB_BLOCK_SIZE(sizeof(RT_VALUE_TYPE)),
sizeof(RT_VALUE_TYPE));
#endif /* !RT_VARLEN_VALUE_SIZE */
#endif /* RT_SHMEM */
/* add root node now so that RT_SET can assume it exists */
rootnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
tree->ctl->root = rootnode.alloc;
tree->ctl->start_shift = 0;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(0);
MemoryContextSwitchTo(old_ctx);
return tree;
}
#ifdef RT_SHMEM
RT_SCOPE RT_RADIX_TREE *
RT_ATTACH(dsa_area *dsa, RT_HANDLE handle)
{
RT_RADIX_TREE *tree;
dsa_pointer control;
tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE));
/* Find the control object in shared memory */
control = handle;
tree->dsa = dsa;
tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, control);
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
return tree;
}
RT_SCOPE void
RT_DETACH(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
pfree(tree);
}
RT_SCOPE RT_HANDLE
RT_GET_HANDLE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
return tree->ctl->handle;
}
RT_SCOPE void
RT_LOCK_EXCLUSIVE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockAcquire(&tree->ctl->lock, LW_EXCLUSIVE);
}
RT_SCOPE void
RT_LOCK_SHARE(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockAcquire(&tree->ctl->lock, LW_SHARED);
}
RT_SCOPE void
RT_UNLOCK(RT_RADIX_TREE * tree)
{
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
LWLockRelease(&tree->ctl->lock);
}
/*
* Recursively free all nodes allocated in the DSA area.
*/
static void
RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift)
{
RT_CHILD_PTR node;
check_stack_depth();
node.alloc = ptr;
RT_PTR_SET_LOCAL(tree, &node);
switch (node.local->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node.local;
for (int i = 0; i < n4->base.count; i++)
{
RT_PTR_ALLOC child = n4->children[i];
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
for (int i = 0; i < n16->base.count; i++)
{
RT_PTR_ALLOC child = n16->children[i];
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
RT_PTR_ALLOC child;
if (!RT_NODE_48_IS_CHUNK_USED(n48, i))
continue;
child = *RT_NODE_48_GET_CHILD(n48, i);
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
RT_PTR_ALLOC child;
if (!RT_NODE_256_IS_CHUNK_USED(n256, i))
continue;
child = *RT_NODE_256_GET_CHILD(n256, i);
if (shift > 0)
RT_FREE_RECURSE(tree, child, shift - RT_SPAN);
else if (!RT_CHILDPTR_IS_VALUE(child))
dsa_free(tree->dsa, child);
}
break;
}
}
/* Free the inner node */
dsa_free(tree->dsa, ptr);
}
#endif
/*
* Free the radix tree, including all nodes and leaves.
*/
RT_SCOPE void
RT_FREE(RT_RADIX_TREE * tree)
{
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
/* Free all memory used for radix tree nodes */
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
RT_FREE_RECURSE(tree, tree->ctl->root, tree->ctl->start_shift);
/*
* Vandalize the control block to help catch programming errors where other
* backends access the memory formerly occupied by this radix tree.
*/
tree->ctl->magic = 0;
dsa_free(tree->dsa, tree->ctl->handle);
#endif
/*
* Free all space allocated within the tree's context and delete all child
* contexts such as those used for nodes.
*/
MemoryContextReset(tree->context);
}
/***************** ITERATION *****************/
/*
* Create and return the iterator for the given radix tree.
*
* Taking a lock in shared mode during the iteration is the caller's
* responsibility.
*/
RT_SCOPE RT_ITER *
RT_BEGIN_ITERATE(RT_RADIX_TREE * tree)
{
RT_ITER *iter;
RT_CHILD_PTR root;
iter = (RT_ITER *) MemoryContextAllocZero(tree->iter_context,
sizeof(RT_ITER));
iter->tree = tree;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
root.alloc = iter->tree->ctl->root;
RT_PTR_SET_LOCAL(tree, &root);
iter->top_level = iter->tree->ctl->start_shift / RT_SPAN;
/* Set the root to start */
iter->cur_level = iter->top_level;
iter->node_iters[iter->cur_level].node = root;
iter->node_iters[iter->cur_level].idx = 0;
return iter;
}
/*
* Scan the inner node and return the next child pointer if one exists, otherwise
* return NULL.
*/
static inline RT_PTR_ALLOC *
RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
{
uint8 key_chunk = 0;
RT_NODE_ITER *node_iter;
RT_CHILD_PTR node;
RT_PTR_ALLOC *slot = NULL;
#ifdef RT_SHMEM
Assert(iter->tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
node_iter = &(iter->node_iters[level]);
node = node_iter->node;
Assert(node.local != NULL);
switch ((node.local)->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local);
if (node_iter->idx >= n4->base.count)
return NULL;
slot = &n4->children[node_iter->idx];
key_chunk = n4->chunks[node_iter->idx];
node_iter->idx++;
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
if (node_iter->idx >= n16->base.count)
return NULL;
slot = &n16->children[node_iter->idx];
key_chunk = n16->chunks[node_iter->idx];
node_iter->idx++;
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
int chunk;
for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (RT_NODE_48_IS_CHUNK_USED(n48, chunk))
break;
}
if (chunk >= RT_NODE_MAX_SLOTS)
return NULL;
slot = RT_NODE_48_GET_CHILD(n48, chunk);
key_chunk = chunk;
node_iter->idx = chunk + 1;
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) (node.local);
int chunk;
for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (RT_NODE_256_IS_CHUNK_USED(n256, chunk))
break;
}
if (chunk >= RT_NODE_MAX_SLOTS)
return NULL;
slot = RT_NODE_256_GET_CHILD(n256, chunk);
key_chunk = chunk;
node_iter->idx = chunk + 1;
break;
}
}
/* Update the key */
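/*
 * With the 8-bit span used here, at level 1 this clears key bits 8..15
 * and ORs in the chunk just found; level 0 likewise fills bits 0..7.
 */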
iter->key &= ~(((uint64) RT_CHUNK_MASK) << (level * RT_SPAN));
iter->key |= (((uint64) key_chunk) << (level * RT_SPAN));
return slot;
}
/*
* Return a pointer to the next value and set *key_p to its key, if there is
* one. Otherwise return NULL.
*/
RT_SCOPE RT_VALUE_TYPE *
RT_ITERATE_NEXT(RT_ITER * iter, uint64 *key_p)
{
RT_PTR_ALLOC *slot = NULL;
while (iter->cur_level <= iter->top_level)
{
RT_CHILD_PTR node;
slot = RT_NODE_ITERATE_NEXT(iter, iter->cur_level);
if (iter->cur_level == 0 && slot != NULL)
{
/* Found a value at the leaf node */
*key_p = iter->key;
node.alloc = *slot;
if (RT_CHILDPTR_IS_VALUE(*slot))
return (RT_VALUE_TYPE *) slot;
else
{
RT_PTR_SET_LOCAL(iter->tree, &node);
return (RT_VALUE_TYPE *) node.local;
}
}
if (slot != NULL)
{
/* Found the child slot, move down the tree */
node.alloc = *slot;
RT_PTR_SET_LOCAL(iter->tree, &node);
iter->cur_level--;
iter->node_iters[iter->cur_level].node = node;
iter->node_iters[iter->cur_level].idx = 0;
}
else
{
/* No more child slots at this level, so move up the tree */
iter->cur_level++;
}
}
/* We've visited all nodes, so the iteration is finished */
return NULL;
}
/*
* Terminate the iteration. The caller is responsible for releasing any locks.
*/
RT_SCOPE void
RT_END_ITERATE(RT_ITER * iter)
{
pfree(iter);
}
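/*
 * A minimal iteration sketch (hypothetical caller code; "tree" is assumed
 * to be an already-populated radix tree, and the lock calls apply only
 * when RT_SHMEM is defined):
 *
 *	RT_ITER    *iter;
 *	uint64		key;
 *	RT_VALUE_TYPE *value;
 *
 *	RT_LOCK_SHARE(tree);
 *	iter = RT_BEGIN_ITERATE(tree);
 *	while ((value = RT_ITERATE_NEXT(iter, &key)) != NULL)
 *	{
 *		(visit "key" and "value"; keys arrive in ascending order)
 *	}
 *	RT_END_ITERATE(iter);
 *	RT_UNLOCK(tree);
 */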
/***************** DELETION *****************/
#ifdef RT_USE_DELETE
/* Delete the element at "deletepos" */
static inline void
RT_SHIFT_ARRAYS_AND_DELETE(uint8 *chunks, RT_PTR_ALLOC * children, int count, int deletepos)
{
/*
* This is basically a memmove, but written in a simple loop for speed on
* small inputs.
*/
for (int i = deletepos; i < count - 1; i++)
{
/* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */
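/*
 * (The empty asm below is presumably there to keep the compiler from
 * recognizing this loop as a memmove idiom and replacing it with a
 * library call.)
 */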
#ifdef __GNUC__
__asm__("");
#endif
chunks[i] = chunks[i + 1];
children[i] = children[i + 1];
}
}
/*
* Copy both chunk and slot arrays into the right place. The element at
* "deletepos" is deleted by skipping it.
*/
static inline void
RT_COPY_ARRAYS_AND_DELETE(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children,
uint8 *src_chunks, RT_PTR_ALLOC * src_children,
int count, int deletepos)
{
for (int i = 0; i < count - 1; i++)
{
/*
* use a branch-free computation to skip the index of the deleted
* element
*/
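/*
 * For example, with count = 4 and deletepos = 1, sourceidx takes the
 * values 0, 2, 3 while destidx takes 0, 1, 2, so the deleted element is
 * skipped without any branch inside the loop.
 */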
int sourceidx = i + (i >= deletepos);
int destidx = i;
dst_chunks[destidx] = src_chunks[sourceidx];
dst_children[destidx] = src_children[sourceidx];
}
}
/*
* Note: While all node-growing functions are called to perform an insertion
* when no more space is available, shrinking is not a hard-and-fast requirement.
* When shrinking nodes, we generally wait until the count is about 3/4 of
* the next lower node's fanout. This prevents ping-ponging between different
* node sizes.
*
* Some shrinking functions delete first and then shrink, either because we
* must or because it's fast and simple that way. Sometimes it's faster to
* delete while shrinking.
*/
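/*
 * For example, if RT_FANOUT_16_LO is 16, a node48 is not shrunk to a
 * node16 until its count has dropped to 12, leaving headroom before the
 * new node16 would need to grow again.
 */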
/*
* Move contents of a node256 to a node48. Any deletion should have happened
* in the caller.
*/
static void pg_noinline
RT_SHRINK_NODE_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
RT_CHILD_PTR newnode;
RT_NODE_48 *new48;
int slot_idx = 0;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48);
new48 = (RT_NODE_48 *) newnode.local;
/* copy over the entries */
RT_COPY_COMMON(newnode, node);
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
if (RT_NODE_256_IS_CHUNK_USED(n256, i))
{
new48->slot_idxs[i] = slot_idx;
new48->children[slot_idx] = n256->children[i];
slot_idx++;
}
}
/*
* Since we just copied a dense array, we can fill "isset" using a single
* store, provided the length of that array is at most the number of bits
* in a bitmapword.
*/
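/*
 * For example, if 20 entries were copied, the word stored below is
 * ((uint64) 1 << 20) - 1, i.e. the low 20 bits set.
 */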
Assert(n256->base.count <= BITS_PER_BITMAPWORD);
new48->isset[0] = (bitmapword) (((uint64) 1 << n256->base.count) - 1);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
int shrink_threshold;
RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
int idx = RT_BM_IDX(chunk);
int bitnum = RT_BM_BIT(chunk);
/* Mark the slot free for "chunk" */
n256->isset[idx] &= ~((bitmapword) 1 << bitnum);
n256->base.count--;
/*
* A full node256 will have a count of zero because of overflow, so we
* delete first before checking the shrink threshold.
*/
Assert(n256->base.count > 0);
/* This simplifies RT_SHRINK_NODE_256() */
shrink_threshold = BITS_PER_BITMAPWORD;
shrink_threshold = Min(RT_FANOUT_48 / 4 * 3, shrink_threshold);
if (n256->base.count <= shrink_threshold)
RT_SHRINK_NODE_256(tree, parent_slot, node, chunk);
}
/*
* Move contents of a node48 to a node16. Any deletion should have happened
* in the caller.
*/
static void pg_noinline
RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_16 *new16;
int destidx = 0;
/*
* Initialize new node. For now we skip the larger node16 size class for
* simplicity.
*/
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO);
new16 = (RT_NODE_16 *) newnode.local;
/* copy over all existing entries */
RT_COPY_COMMON(newnode, node);
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (n48->slot_idxs[chunk] != RT_INVALID_SLOT_IDX)
{
new16->chunks[destidx] = chunk;
new16->children[destidx] = n48->children[n48->slot_idxs[chunk]];
destidx++;
}
}
Assert(destidx < new16->base.fanout);
RT_VERIFY_NODE((RT_NODE *) new16);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
int deletepos = n48->slot_idxs[chunk];
/* For now we skip the larger node16 size class for simplicity */
int shrink_threshold = RT_FANOUT_16_LO / 4 * 3;
int idx;
int bitnum;
Assert(deletepos != RT_INVALID_SLOT_IDX);
idx = RT_BM_IDX(deletepos);
bitnum = RT_BM_BIT(deletepos);
n48->isset[idx] &= ~((bitmapword) 1 << bitnum);
n48->slot_idxs[chunk] = RT_INVALID_SLOT_IDX;
n48->base.count--;
/*
* To keep shrinking simple, do it after deleting, which is fast for
* node48 anyway.
*/
if (n48->base.count <= shrink_threshold)
RT_SHRINK_NODE_48(tree, parent_slot, node, chunk);
}
/*
* Move contents of a node16 to a node4, and delete the element at "deletepos".
* By deleting as we move, we can avoid memmove operations in the new
* node.
*/
static void pg_noinline
RT_SHRINK_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 deletepos)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
RT_CHILD_PTR newnode;
RT_NODE_4 *new4;
/* initialize new node */
newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
new4 = (RT_NODE_4 *) newnode.local;
/* copy over existing entries, except for the one at "deletepos" */
RT_COPY_COMMON(newnode, node);
RT_COPY_ARRAYS_AND_DELETE(new4->chunks, new4->children,
n16->chunks, n16->children,
n16->base.count, deletepos);
new4->base.count--;
RT_VERIFY_NODE((RT_NODE *) new4);
/* free old node and update reference in parent */
*parent_slot = newnode.alloc;
RT_FREE_NODE(tree, node);
}
static inline void
RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
int deletepos = slot - n16->children;
/*
* The threshold of 4 for shrinking to node4 is hard-coded: after shrinking,
* the new node ends up with 3 elements, and 3 is the largest count for which
* linear search is faster than SIMD search, at least on x86-64.
*/
if (n16->base.count <= 4)
{
RT_SHRINK_NODE_16(tree, parent_slot, node, deletepos);
return;
}
Assert(deletepos >= 0);
Assert(n16->chunks[deletepos] == chunk);
RT_SHIFT_ARRAYS_AND_DELETE(n16->chunks, n16->children,
n16->base.count, deletepos);
n16->base.count--;
}
static inline void
RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node.local;
if (n4->base.count == 1)
{
Assert(n4->chunks[0] == chunk);
/*
* If we're deleting the last entry from the root child node, don't
* free it; instead mark both the tree and the root child node empty.
* That way, RT_SET can assume it exists.
*/
if (parent_slot == &tree->ctl->root)
{
n4->base.count = 0;
tree->ctl->start_shift = 0;
tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(0);
}
else
{
/*
* Deleting the last entry, so just free the entire node.
* RT_DELETE_RECURSIVE has already freed the value and lower-level
* children.
*/
RT_FREE_NODE(tree, node);
/*
* Also null out the parent's slot -- this tells the next higher
* level to delete its child pointer
*/
*parent_slot = RT_INVALID_PTR_ALLOC;
}
}
else
{
int deletepos = slot - n4->children;
Assert(deletepos >= 0);
Assert(n4->chunks[deletepos] == chunk);
RT_SHIFT_ARRAYS_AND_DELETE(n4->chunks, n4->children,
n4->base.count, deletepos);
n4->base.count--;
}
}
/*
* Delete the child pointer corresponding to "key" in the given node.
*/
static inline void
RT_NODE_DELETE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
{
switch ((node.local)->kind)
{
case RT_NODE_KIND_4:
{
RT_REMOVE_CHILD_4(tree, parent_slot, node, chunk, slot);
return;
}
case RT_NODE_KIND_16:
{
RT_REMOVE_CHILD_16(tree, parent_slot, node, chunk, slot);
return;
}
case RT_NODE_KIND_48:
{
RT_REMOVE_CHILD_48(tree, parent_slot, node, chunk);
return;
}
case RT_NODE_KIND_256:
{
RT_REMOVE_CHILD_256(tree, parent_slot, node, chunk);
return;
}
default:
pg_unreachable();
}
}
/* workhorse for RT_DELETE */
static bool
RT_DELETE_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift)
{
RT_PTR_ALLOC *slot;
RT_CHILD_PTR node;
uint8 chunk = RT_GET_KEY_CHUNK(key, shift);
node.alloc = *parent_slot;
RT_PTR_SET_LOCAL(tree, &node);
slot = RT_NODE_SEARCH(node.local, chunk);
if (slot == NULL)
return false;
if (shift == 0)
{
if (!RT_CHILDPTR_IS_VALUE(*slot))
RT_FREE_LEAF(tree, *slot);
RT_NODE_DELETE(tree, parent_slot, node, chunk, slot);
return true;
}
else
{
bool deleted;
deleted = RT_DELETE_RECURSIVE(tree, slot, key, shift - RT_SPAN);
/* Child node was freed, so delete its slot now */
if (*slot == RT_INVALID_PTR_ALLOC)
{
Assert(deleted);
RT_NODE_DELETE(tree, parent_slot, node, chunk, slot);
}
return deleted;
}
}
/*
* Delete the given key from the radix tree. If the key is found, delete it
* and return true; otherwise do nothing and return false.
*
* Taking a lock in exclusive mode is the caller's responsibility.
*/
RT_SCOPE bool
RT_DELETE(RT_RADIX_TREE * tree, uint64 key)
{
bool deleted;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
#endif
if (key > tree->ctl->max_val)
return false;
Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
deleted = RT_DELETE_RECURSIVE(tree, &tree->ctl->root,
key, tree->ctl->start_shift);
/* If we found the key to delete, update the statistics */
if (deleted)
{
tree->ctl->num_keys--;
Assert(tree->ctl->num_keys >= 0);
}
return deleted;
}
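/*
 * A minimal deletion sketch (hypothetical caller code; the lock calls are
 * generated only when RT_SHMEM is defined, and "tree" and "key" come from
 * the caller):
 *
 *	bool		found;
 *
 *	RT_LOCK_EXCLUSIVE(tree);
 *	found = RT_DELETE(tree, key);
 *	RT_UNLOCK(tree);
 *
 * "found" is true only if the key was present and its value was removed.
 */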
#endif /* RT_USE_DELETE */
/***************** UTILITY FUNCTIONS *****************/
/*
* Return the amount of memory used by the radix tree.
*
* Since dsa_get_total_size() does appropriate locking, the caller doesn't
* need to take a lock.
*/
RT_SCOPE uint64
RT_MEMORY_USAGE(RT_RADIX_TREE * tree)
{
size_t total = 0;
#ifdef RT_SHMEM
Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
total = dsa_get_total_size(tree->dsa);
#else
total = MemoryContextMemAllocated(tree->context, true);
#endif
return total;
}
/*
* Perform some sanity checks on the given node.
*/
static void
RT_VERIFY_NODE(RT_NODE * node)
{
#ifdef USE_ASSERT_CHECKING
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
/* RT_DUMP_NODE(node); */
for (int i = 1; i < n4->base.count; i++)
Assert(n4->chunks[i - 1] < n4->chunks[i]);
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node;
/* RT_DUMP_NODE(node); */
for (int i = 1; i < n16->base.count; i++)
Assert(n16->chunks[i - 1] < n16->chunks[i]);
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
int cnt = 0;
/* RT_DUMP_NODE(node); */
for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
{
uint8 slot = n48->slot_idxs[i];
int idx = RT_BM_IDX(slot);
int bitnum = RT_BM_BIT(slot);
if (!RT_NODE_48_IS_CHUNK_USED(n48, i))
continue;
/* Check if the corresponding slot is used */
Assert(slot < node->fanout);
Assert((n48->isset[idx] & ((bitmapword) 1 << bitnum)) != 0);
cnt++;
}
Assert(n48->base.count == cnt);
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
int cnt = 0;
/* RT_DUMP_NODE(node); */
for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
cnt += bmw_popcount(n256->isset[i]);
/*
* Check that the number of used chunks matches the count, accounting
* for overflow
*/
if (cnt == RT_FANOUT_256)
Assert(n256->base.count == 0);
else
Assert(n256->base.count == cnt);
break;
}
}
#endif
}
/***************** DEBUG FUNCTIONS *****************/
#ifdef RT_DEBUG
/*
* Print out tree stats, some of which are only collected in debugging builds.
*/
RT_SCOPE void
RT_STATS(RT_RADIX_TREE * tree)
{
fprintf(stderr, "max_val = " UINT64_FORMAT "\n", tree->ctl->max_val);
fprintf(stderr, "num_keys = %lld\n", (long long) tree->ctl->num_keys);
#ifdef RT_SHMEM
fprintf(stderr, "handle = " DSA_POINTER_FORMAT "\n", tree->ctl->handle);
#endif
fprintf(stderr, "height = %d", tree->ctl->start_shift / RT_SPAN);
for (int i = 0; i < RT_NUM_SIZE_CLASSES; i++)
{
RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i];
fprintf(stderr, ", n%d = %lld", size_class.fanout, (long long) tree->ctl->num_nodes[i]);
}
fprintf(stderr, ", leaves = %lld", (long long) tree->ctl->num_leaves);
fprintf(stderr, "\n");
}
/*
* Print out debugging information about the given node.
*/
static void
pg_attribute_unused()
RT_DUMP_NODE(RT_NODE * node)
{
#ifdef RT_SHMEM
#define RT_CHILD_PTR_FORMAT DSA_POINTER_FORMAT
#else
#define RT_CHILD_PTR_FORMAT "%p"
#endif
fprintf(stderr, "kind %d, fanout %d, count %u\n",
(node->kind == RT_NODE_KIND_4) ? 4 :
(node->kind == RT_NODE_KIND_16) ? 16 :
(node->kind == RT_NODE_KIND_48) ? 48 : 256,
node->fanout == 0 ? 256 : node->fanout,
node->count == 0 ? 256 : node->count);
switch (node->kind)
{
case RT_NODE_KIND_4:
{
RT_NODE_4 *n4 = (RT_NODE_4 *) node;
fprintf(stderr, "chunks and slots:\n");
for (int i = 0; i < n4->base.count; i++)
{
fprintf(stderr, " [%d] chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
i, n4->chunks[i], n4->children[i]);
}
break;
}
case RT_NODE_KIND_16:
{
RT_NODE_16 *n16 = (RT_NODE_16 *) node;
fprintf(stderr, "chunks and slots:\n");
for (int i = 0; i < n16->base.count; i++)
{
fprintf(stderr, " [%d] chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
i, n16->chunks[i], n16->children[i]);
}
break;
}
case RT_NODE_KIND_48:
{
RT_NODE_48 *n48 = (RT_NODE_48 *) node;
char *sep = "";
fprintf(stderr, "slot_idxs: \n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_48_IS_CHUNK_USED(n48, chunk))
continue;
fprintf(stderr, " idx[%d] = %d\n",
chunk, n48->slot_idxs[chunk]);
}
fprintf(stderr, "isset-bitmap: ");
for (int i = 0; i < (RT_FANOUT_48_MAX / BITS_PER_BYTE); i++)
{
fprintf(stderr, "%s%x", sep, ((uint8 *) n48->isset)[i]);
sep = " ";
}
fprintf(stderr, "\n");
fprintf(stderr, "chunks and slots:\n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_48_IS_CHUNK_USED(n48, chunk))
continue;
fprintf(stderr, " chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
chunk,
*RT_NODE_48_GET_CHILD(n48, chunk));
}
break;
}
case RT_NODE_KIND_256:
{
RT_NODE_256 *n256 = (RT_NODE_256 *) node;
char *sep = "";
fprintf(stderr, "isset-bitmap: ");
for (int i = 0; i < (RT_FANOUT_256 / BITS_PER_BYTE); i++)
{
fprintf(stderr, "%s%x", sep, ((uint8 *) n256->isset)[i]);
sep = " ";
}
fprintf(stderr, "\n");
fprintf(stderr, "chunks and slots:\n");
for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
{
if (!RT_NODE_256_IS_CHUNK_USED(n256, chunk))
continue;
fprintf(stderr, " chunk %x slot " RT_CHILD_PTR_FORMAT "\n",
chunk,
*RT_NODE_256_GET_CHILD(n256, chunk));
}
break;
}
}
}
#endif /* RT_DEBUG */
#endif /* RT_DEFINE */
/* undefine external parameters, so next radix tree can be defined */
#undef RT_PREFIX
#undef RT_SCOPE
#undef RT_DECLARE
#undef RT_DEFINE
#undef RT_VALUE_TYPE
#undef RT_VARLEN_VALUE_SIZE
#undef RT_RUNTIME_EMBEDDABLE_VALUE
#undef RT_SHMEM
#undef RT_USE_DELETE
#undef RT_DEBUG
/* locally declared macros */
#undef RT_MAKE_PREFIX
#undef RT_MAKE_NAME
#undef RT_MAKE_NAME_
#undef RT_STR
#undef RT_STR_
#undef RT_SPAN
#undef RT_NODE_MAX_SLOTS
#undef RT_CHUNK_MASK
#undef RT_MAX_SHIFT
#undef RT_MAX_LEVEL
#undef RT_GET_KEY_CHUNK
#undef RT_BM_IDX
#undef RT_BM_BIT
#undef RT_NODE_MUST_GROW
#undef RT_NODE_KIND_COUNT
#undef RT_NUM_SIZE_CLASSES
#undef RT_INVALID_SLOT_IDX
#undef RT_SLAB_BLOCK_SIZE
#undef RT_RADIX_TREE_MAGIC
#undef RT_CHILD_PTR_FORMAT
/* type declarations */
#undef RT_RADIX_TREE
#undef RT_RADIX_TREE_CONTROL
#undef RT_CHILD_PTR
#undef RT_PTR_ALLOC
#undef RT_INVALID_PTR_ALLOC
#undef RT_HANDLE
#undef RT_ITER
#undef RT_NODE
#undef RT_NODE_ITER
#undef RT_NODE_KIND_4
#undef RT_NODE_KIND_16
#undef RT_NODE_KIND_48
#undef RT_NODE_KIND_256
#undef RT_NODE_4
#undef RT_NODE_16
#undef RT_NODE_48
#undef RT_NODE_256
#undef RT_SIZE_CLASS
#undef RT_SIZE_CLASS_ELEM
#undef RT_SIZE_CLASS_INFO
#undef RT_CLASS_4
#undef RT_CLASS_16_LO
#undef RT_CLASS_16_HI
#undef RT_CLASS_48
#undef RT_CLASS_256
#undef RT_FANOUT_4
#undef RT_FANOUT_4_MAX
#undef RT_FANOUT_16_LO
#undef RT_FANOUT_16_HI
#undef RT_FANOUT_16_MAX
#undef RT_FANOUT_48
#undef RT_FANOUT_48_MAX
#undef RT_FANOUT_256
/* function declarations */
#undef RT_CREATE
#undef RT_FREE
#undef RT_ATTACH
#undef RT_DETACH
#undef RT_LOCK_EXCLUSIVE
#undef RT_LOCK_SHARE
#undef RT_UNLOCK
#undef RT_GET_HANDLE
#undef RT_FIND
#undef RT_SET
#undef RT_BEGIN_ITERATE
#undef RT_ITERATE_NEXT
#undef RT_END_ITERATE
#undef RT_DELETE
#undef RT_MEMORY_USAGE
#undef RT_DUMP_NODE
#undef RT_STATS
/* internal helper functions */
#undef RT_GET_VALUE_SIZE
#undef RT_VALUE_IS_EMBEDDABLE
#undef RT_CHILDPTR_IS_VALUE
#undef RT_GET_SLOT_RECURSIVE
#undef RT_DELETE_RECURSIVE
#undef RT_ALLOC_NODE
#undef RT_ALLOC_LEAF
#undef RT_FREE_NODE
#undef RT_FREE_LEAF
#undef RT_FREE_RECURSE
#undef RT_EXTEND_UP
#undef RT_EXTEND_DOWN
#undef RT_COPY_COMMON
#undef RT_PTR_SET_LOCAL
#undef RT_PTR_ALLOC_IS_VALID
#undef RT_NODE_16_SEARCH_EQ
#undef RT_NODE_4_GET_INSERTPOS
#undef RT_NODE_16_GET_INSERTPOS
#undef RT_SHIFT_ARRAYS_FOR_INSERT
#undef RT_SHIFT_ARRAYS_AND_DELETE
#undef RT_COPY_ARRAYS_FOR_INSERT
#undef RT_COPY_ARRAYS_AND_DELETE
#undef RT_NODE_48_IS_CHUNK_USED
#undef RT_NODE_48_GET_CHILD
#undef RT_NODE_256_IS_CHUNK_USED
#undef RT_NODE_256_GET_CHILD
#undef RT_KEY_GET_SHIFT
#undef RT_SHIFT_GET_MAX_VAL
#undef RT_NODE_SEARCH
#undef RT_ADD_CHILD_4
#undef RT_ADD_CHILD_16
#undef RT_ADD_CHILD_48
#undef RT_ADD_CHILD_256
#undef RT_GROW_NODE_4
#undef RT_GROW_NODE_16
#undef RT_GROW_NODE_48
#undef RT_REMOVE_CHILD_4
#undef RT_REMOVE_CHILD_16
#undef RT_REMOVE_CHILD_48
#undef RT_REMOVE_CHILD_256
#undef RT_SHRINK_NODE_16
#undef RT_SHRINK_NODE_48
#undef RT_SHRINK_NODE_256
#undef RT_NODE_DELETE
#undef RT_NODE_INSERT
#undef RT_NODE_ITERATE_NEXT
#undef RT_VERIFY_NODE