Add template for adaptive radix tree

This implements a radix tree data structure based on the design in
"The Adaptive Radix Tree: ARTful Indexing for Main-Memory Databases"
by Viktor Leis, Alfons Kemper, and ThomasNeumann, 2013. The main
technique that makes it adaptive is using several different node types,
each with a different capacity of elements, and a different algorithm
for accessing them. The nodes start small and grow/shrink as needed.

The main advantage over hash tables is efficient sorted iteration and
better memory locality when successive keys are lexicographically
close together. The implementation currently assumes 64-bit integer
keys, and traversing the tree is in general slower than a linear
probing hash table, so this is not a general-purpose associative array.

The paper describes two other techniques not implemented here,
namely "path compression" and "lazy expansion". These can further
reduce memory usage and speed up traversal, but the former would add
significant complexity and the latter requires storing the full key
with the value. We do trivially compress the path when leading bytes
of the key are zeros, however.

For value storage, we use "combined pointer/value slots", as
recommended in the paper. Values of size equal or smaller than the the
platform's pointer type are stored in the array of child pointers in
the last level node, while larger values are each stored in a separate
allocation. This is for now fixed at compile time, but it would be
fairly trivial to allow determining at runtime how variable-length
values are stored.

One innovation in our implementation compared to the ART paper is
decoupling the notion of node "size class" from "kind". The size
classes within a given node kind have the same underlying type, but
a variable capacity for children, so we can introduce additional node
sizes with little additional code.

To enable different use cases to specialize for different value types
and for shared/local memory, we use macro-templatized code generation
in the same manner as simplehash.h and sort_template.h.

Future commits will use this infrastructure for storing TIDs.

Patch by Masahiko Sawada and John Naylor, but a substantial amount of
credit is due to Andres Freund, whose proof-of-concept was a valuable
source of coding idioms and awareness of performance pitfalls, and
who reviewed earlier versions.

Discussion: https://postgr.es/m/CAD21AoAfOZvmfR0j8VmZorZjL7RhTiQdVttNuC4W-Shdc2a-AA%40mail.gmail.com
This commit is contained in:
John Naylor 2024-03-07 12:40:11 +07:00
parent 65db0cfb4c
commit ee1b30f128
14 changed files with 3620 additions and 0 deletions

View File

@ -1035,6 +1035,19 @@ dsa_set_size_limit(dsa_area *area, size_t limit)
LWLockRelease(DSA_AREA_LOCK(area));
}
/* Return the total size of all active segments */
size_t
dsa_get_total_size(dsa_area *area)
{
size_t size;
LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
size = area->control->total_segment_size;
LWLockRelease(DSA_AREA_LOCK(area));
return size;
}
/*
* Aggressively free all spare memory in the hope of returning DSM segments to
* the operating system.

3009
src/include/lib/radixtree.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -121,6 +121,7 @@ extern dsa_handle dsa_get_handle(dsa_area *area);
extern dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags);
extern void dsa_free(dsa_area *area, dsa_pointer dp);
extern void *dsa_get_address(dsa_area *area, dsa_pointer dp);
extern size_t dsa_get_total_size(dsa_area *area);
extern void dsa_trim(dsa_area *area);
extern void dsa_dump(dsa_area *area);

View File

@ -28,6 +28,7 @@ SUBDIRS = \
test_parser \
test_pg_dump \
test_predtest \
test_radixtree \
test_rbtree \
test_regex \
test_resowner \

View File

@ -27,6 +27,7 @@ subdir('test_oat_hooks')
subdir('test_parser')
subdir('test_pg_dump')
subdir('test_predtest')
subdir('test_radixtree')
subdir('test_rbtree')
subdir('test_regex')
subdir('test_resowner')

View File

@ -0,0 +1,4 @@
# Generated subdirectories
/log/
/results/
/tmp_check/

View File

@ -0,0 +1,23 @@
# src/test/modules/test_radixtree/Makefile
MODULE_big = test_radixtree
OBJS = \
$(WIN32RES) \
test_radixtree.o
PGFILEDESC = "test_radixtree - test code for src/include/lib/radixtree.h"
EXTENSION = test_radixtree
DATA = test_radixtree--1.0.sql
REGRESS = test_radixtree
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = src/test/modules/test_radixtree
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

View File

@ -0,0 +1,41 @@
CREATE EXTENSION test_radixtree;
--
-- All the logic is in the test_radixtree() function. It will throw
-- an error if something fails.
--
SELECT test_radixtree();
NOTICE: testing node node-4 with shift 0 and ascending keys
NOTICE: testing node node-4 with shift 0 and descending keys
NOTICE: testing node node-4 with shift 8 and ascending keys
NOTICE: testing node node-4 with shift 8 and descending keys
NOTICE: testing node node-4 with shift 56 and ascending keys
NOTICE: testing node node-4 with shift 56 and descending keys
NOTICE: testing node node-16-lo with shift 0 and ascending keys
NOTICE: testing node node-16-lo with shift 0 and descending keys
NOTICE: testing node node-16-lo with shift 8 and ascending keys
NOTICE: testing node node-16-lo with shift 8 and descending keys
NOTICE: testing node node-16-lo with shift 56 and ascending keys
NOTICE: testing node node-16-lo with shift 56 and descending keys
NOTICE: testing node node-16-hi with shift 0 and ascending keys
NOTICE: testing node node-16-hi with shift 0 and descending keys
NOTICE: testing node node-16-hi with shift 8 and ascending keys
NOTICE: testing node node-16-hi with shift 8 and descending keys
NOTICE: testing node node-16-hi with shift 56 and ascending keys
NOTICE: testing node node-16-hi with shift 56 and descending keys
NOTICE: testing node node-48 with shift 0 and ascending keys
NOTICE: testing node node-48 with shift 0 and descending keys
NOTICE: testing node node-48 with shift 8 and ascending keys
NOTICE: testing node node-48 with shift 8 and descending keys
NOTICE: testing node node-48 with shift 56 and ascending keys
NOTICE: testing node node-48 with shift 56 and descending keys
NOTICE: testing node node-256 with shift 0 and ascending keys
NOTICE: testing node node-256 with shift 0 and descending keys
NOTICE: testing node node-256 with shift 8 and ascending keys
NOTICE: testing node node-256 with shift 8 and descending keys
NOTICE: testing node node-256 with shift 56 and ascending keys
NOTICE: testing node node-256 with shift 56 and descending keys
test_radixtree
----------------
(1 row)

View File

@ -0,0 +1,34 @@
# Copyright (c) 2024, PostgreSQL Global Development Group
test_radixtree_sources = files(
'test_radixtree.c',
)
if host_system == 'windows'
test_radixtree_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
'--NAME', 'test_radixtree',
'--FILEDESC', 'test_radixtree - test code for src/include/lib/radixtree.h',])
endif
test_radixtree = shared_module('test_radixtree',
test_radixtree_sources,
link_with: pgport_srv,
kwargs: pg_test_mod_args,
)
test_install_libs += test_radixtree
test_install_data += files(
'test_radixtree.control',
'test_radixtree--1.0.sql',
)
tests += {
'name': 'test_radixtree',
'sd': meson.current_source_dir(),
'bd': meson.current_build_dir(),
'regress': {
'sql': [
'test_radixtree',
],
},
}

View File

@ -0,0 +1,7 @@
CREATE EXTENSION test_radixtree;
--
-- All the logic is in the test_radixtree() function. It will throw
-- an error if something fails.
--
SELECT test_radixtree();

View File

@ -0,0 +1,8 @@
/* src/test/modules/test_radixtree/test_radixtree--1.0.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION test_radixtree" to load this file. \quit
CREATE FUNCTION test_radixtree()
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;

View File

@ -0,0 +1,473 @@
/*--------------------------------------------------------------------------
*
* test_radixtree.c
* Test module for adapive radix tree.
*
* Copyright (c) 2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/test/modules/test_radixtree/test_radixtree.c
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "utils/memutils.h"
#include "utils/timestamp.h"
/* uncomment to use shared memory for the tree */
/* #define TEST_SHARED_RT */
#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "X"
/* Convenient macros to test results */
#define EXPECT_TRUE(expr) \
do { \
if (!(expr)) \
elog(ERROR, \
"%s was unexpectedly false in file \"%s\" line %u", \
#expr, __FILE__, __LINE__); \
} while (0)
#define EXPECT_FALSE(expr) \
do { \
if (expr) \
elog(ERROR, \
"%s was unexpectedly true in file \"%s\" line %u", \
#expr, __FILE__, __LINE__); \
} while (0)
#define EXPECT_EQ_U64(result_expr, expected_expr) \
do { \
uint64 _result = (result_expr); \
uint64 _expected = (expected_expr); \
if (_result != _expected) \
elog(ERROR, \
"%s yielded " UINT64_HEX_FORMAT ", expected " UINT64_HEX_FORMAT " (%s) in file \"%s\" line %u", \
#result_expr, _result, _expected, #expected_expr, __FILE__, __LINE__); \
} while (0)
/*
* With uint64, 64-bit platforms store the value in the last-level child
* pointer, and 32-bit platforms store this in a single-value leaf.
* This gives us buildfarm coverage for both paths in this module.
*/
typedef uint64 TestValueType;
/*
* The node class name and the number of keys big enough to grow nodes
* into each size class.
*/
typedef struct rt_node_class_test_elem
{
char *class_name;
int nkeys;
} rt_node_class_test_elem;
static rt_node_class_test_elem rt_node_class_tests[] =
{
{
.class_name = "node-4", /* RT_CLASS_4 */
.nkeys = 2,
},
{
.class_name = "node-16-lo", /* RT_CLASS_16_LO */
.nkeys = 15,
},
{
.class_name = "node-16-hi", /* RT_CLASS_16_HI */
.nkeys = 30,
},
{
.class_name = "node-48", /* RT_CLASS_48 */
.nkeys = 60,
},
{
.class_name = "node-256", /* RT_CLASS_256 */
.nkeys = 256,
},
};
/* define the radix tree implementation to test */
#define RT_PREFIX rt
#define RT_SCOPE
#define RT_DECLARE
#define RT_DEFINE
#define RT_USE_DELETE
#define RT_VALUE_TYPE TestValueType
#ifdef TEST_SHARED_RT
#define RT_SHMEM
#endif
#define RT_DEBUG
#include "lib/radixtree.h"
/*
* Return the number of keys in the radix tree.
*/
static uint64
rt_num_entries(rt_radix_tree * tree)
{
return tree->ctl->num_keys;
}
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(test_radixtree);
static void
test_empty(void)
{
MemoryContext radixtree_ctx;
rt_radix_tree *radixtree;
rt_iter *iter;
uint64 key;
#ifdef TEST_SHARED_RT
int tranche_id = LWLockNewTrancheId();
dsa_area *dsa;
#endif
radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext,
"test_radix_tree",
ALLOCSET_SMALL_SIZES);
#ifdef TEST_SHARED_RT
LWLockRegisterTranche(tranche_id, "test_radix_tree");
dsa = dsa_create(tranche_id);
radixtree = rt_create(radixtree_ctx, dsa, tranche_id);
#else
radixtree = rt_create(radixtree_ctx);
#endif
/* Should not find anything in an empty tree */
EXPECT_TRUE(rt_find(radixtree, 0) == NULL);
EXPECT_TRUE(rt_find(radixtree, 1) == NULL);
EXPECT_TRUE(rt_find(radixtree, PG_UINT64_MAX) == NULL);
EXPECT_FALSE(rt_delete(radixtree, 0));
EXPECT_TRUE(rt_num_entries(radixtree) == 0);
/* Iterating on an empty tree should not return anything */
iter = rt_begin_iterate(radixtree);
EXPECT_TRUE(rt_iterate_next(iter, &key) == NULL);
rt_end_iterate(iter);
rt_free(radixtree);
#ifdef TEST_SHARED_RT
dsa_detach(dsa);
#endif
}
/* Basic set, find, and delete tests */
static void
test_basic(rt_node_class_test_elem *test_info, int shift, bool asc)
{
MemoryContext radixtree_ctx;
rt_radix_tree *radixtree;
rt_iter *iter;
uint64 *keys;
int children = test_info->nkeys;
#ifdef TEST_SHARED_RT
int tranche_id = LWLockNewTrancheId();
dsa_area *dsa;
#endif
radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext,
"test_radix_tree",
ALLOCSET_SMALL_SIZES);
#ifdef TEST_SHARED_RT
LWLockRegisterTranche(tranche_id, "test_radix_tree");
dsa = dsa_create(tranche_id);
radixtree = rt_create(radixtree_ctx, dsa, tranche_id);
#else
radixtree = rt_create(radixtree_ctx);
#endif
elog(NOTICE, "testing node %s with shift %d and %s keys",
test_info->class_name, shift, asc ? "ascending" : "descending");
keys = palloc(sizeof(uint64) * children);
for (int i = 0; i < children; i++)
{
if (asc)
keys[i] = (uint64) i << shift;
else
keys[i] = (uint64) (children - 1 - i) << shift;
}
/*
* Insert keys. Since the tree was just created, rt_set should return
* false.
*/
for (int i = 0; i < children; i++)
EXPECT_FALSE(rt_set(radixtree, keys[i], (TestValueType *) & keys[i]));
rt_stats(radixtree);
/* look up keys */
for (int i = 0; i < children; i++)
{
TestValueType *value;
value = rt_find(radixtree, keys[i]);
/* Test rt_find returns the expected value */
EXPECT_TRUE(value != NULL);
EXPECT_EQ_U64(*value, (TestValueType) keys[i]);
}
/* update keys */
for (int i = 0; i < children; i++)
{
TestValueType update = keys[i] + 1;
/* rt_set should report the key found */
EXPECT_TRUE(rt_set(radixtree, keys[i], (TestValueType *) & update));
}
/* delete and re-insert keys */
for (int i = 0; i < children; i++)
{
EXPECT_TRUE(rt_delete(radixtree, keys[i]));
EXPECT_FALSE(rt_set(radixtree, keys[i], (TestValueType *) & keys[i]));
}
/* look up keys after deleting and re-inserting */
for (int i = 0; i < children; i++)
{
TestValueType *value;
value = rt_find(radixtree, keys[i]);
/* Test that rt_find returns the expected value */
EXPECT_TRUE(value != NULL);
EXPECT_EQ_U64(*value, (TestValueType) keys[i]);
}
/* test that iteration returns the expected keys and values */
iter = rt_begin_iterate(radixtree);
for (int i = 0; i < children; i++)
{
uint64 expected;
uint64 iterkey;
TestValueType *iterval;
/* iteration is ordered by key, so adjust expected value accordingly */
if (asc)
expected = keys[i];
else
expected = keys[children - 1 - i];
iterval = rt_iterate_next(iter, &iterkey);
EXPECT_TRUE(iterval != NULL);
EXPECT_EQ_U64(iterkey, expected);
EXPECT_EQ_U64(*iterval, expected);
}
rt_end_iterate(iter);
/* delete all keys again */
for (int i = 0; i < children; i++)
EXPECT_TRUE(rt_delete(radixtree, keys[i]));
/* test that all keys were deleted */
for (int i = 0; i < children; i++)
EXPECT_TRUE(rt_find(radixtree, keys[i]) == NULL);
rt_stats(radixtree);
pfree(keys);
rt_free(radixtree);
#ifdef TEST_SHARED_RT
dsa_detach(dsa);
#endif
}
static int
key_cmp(const void *a, const void *b)
{
return pg_cmp_u64(*(const uint64 *) a, *(const uint64 *) b);
}
static void
test_random(void)
{
MemoryContext radixtree_ctx;
rt_radix_tree *radixtree;
rt_iter *iter;
pg_prng_state state;
/* limit memory usage by limiting the key space */
uint64 filter = ((uint64) (0x07 << 24) | (0xFF << 16) | 0xFF);
uint64 seed = GetCurrentTimestamp();
int num_keys = 100000;
uint64 *keys;
#ifdef TEST_SHARED_RT
int tranche_id = LWLockNewTrancheId();
dsa_area *dsa;
#endif
radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext,
"test_radix_tree",
ALLOCSET_SMALL_SIZES);
#ifdef TEST_SHARED_RT
LWLockRegisterTranche(tranche_id, "test_radix_tree");
dsa = dsa_create(tranche_id);
radixtree = rt_create(radixtree_ctx, dsa, tranche_id);
#else
radixtree = rt_create(radixtree_ctx);
#endif
/* add some random values */
pg_prng_seed(&state, seed);
keys = (TestValueType *) palloc(sizeof(uint64) * num_keys);
for (uint64 i = 0; i < num_keys; i++)
{
uint64 key = pg_prng_uint64(&state) & filter;
TestValueType val = (TestValueType) key;
/* save in an array */
keys[i] = key;
rt_set(radixtree, key, &val);
}
rt_stats(radixtree);
for (uint64 i = 0; i < num_keys; i++)
{
TestValueType *value;
value = rt_find(radixtree, keys[i]);
/* Test rt_find for values just inserted */
EXPECT_TRUE(value != NULL);
EXPECT_EQ_U64(*value, keys[i]);
}
/* sort keys for iteration and absence tests */
qsort(keys, num_keys, sizeof(uint64), key_cmp);
/* should not find numbers in between the keys */
for (uint64 i = 0; i < num_keys - 1; i++)
{
TestValueType *value;
/* skip duplicate and adjacent keys */
if (keys[i + 1] == keys[i] || keys[i + 1] == keys[i] + 1)
continue;
/* should not find the number right after key */
value = rt_find(radixtree, keys[i] + 1);
EXPECT_TRUE(value == NULL);
}
/* should not find numbers lower than lowest key */
for (uint64 key = 0; key < keys[0]; key++)
{
TestValueType *value;
/* arbitrary stopping point */
if (key > 10000)
break;
value = rt_find(radixtree, key);
EXPECT_TRUE(value == NULL);
}
/* should not find numbers higher than highest key */
for (uint64 i = 1; i < 10000; i++)
{
TestValueType *value;
value = rt_find(radixtree, keys[num_keys - 1] + i);
EXPECT_TRUE(value == NULL);
}
/* test that iteration returns the expected keys and values */
iter = rt_begin_iterate(radixtree);
for (int i = 0; i < num_keys; i++)
{
uint64 expected;
uint64 iterkey;
TestValueType *iterval;
/* skip duplicate keys */
if (i < num_keys - 1 && keys[i + 1] == keys[i])
continue;
expected = keys[i];
iterval = rt_iterate_next(iter, &iterkey);
EXPECT_TRUE(iterval != NULL);
EXPECT_EQ_U64(iterkey, expected);
EXPECT_EQ_U64(*iterval, expected);
}
rt_end_iterate(iter);
/* reset random number generator for deletion */
pg_prng_seed(&state, seed);
/* delete in original random order */
for (uint64 i = 0; i < num_keys; i++)
{
uint64 key = pg_prng_uint64(&state) & filter;
rt_delete(radixtree, key);
}
EXPECT_TRUE(rt_num_entries(radixtree) == 0);
pfree(keys);
rt_free(radixtree);
#ifdef TEST_SHARED_RT
dsa_detach(dsa);
#endif
}
Datum
test_radixtree(PG_FUNCTION_ARGS)
{
/* borrowed from RT_MAX_SHIFT */
const int max_shift = (sizeof(uint64) - 1) * BITS_PER_BYTE;
test_empty();
for (int i = 0; i < lengthof(rt_node_class_tests); i++)
{
rt_node_class_test_elem *test_info = &(rt_node_class_tests[i]);
/* a tree with one level, i.e. a single node under the root node */
test_basic(test_info, 0, true);
test_basic(test_info, 0, false);
/* a tree with two levels */
test_basic(test_info, 8, true);
test_basic(test_info, 8, false);
/* a tree with the maximum number of levels */
test_basic(test_info, max_shift, true);
test_basic(test_info, max_shift, false);
}
test_random();
PG_RETURN_VOID();
}

View File

@ -0,0 +1,4 @@
comment = 'Test code for radix tree'
default_version = '1.0'
module_pathname = '$libdir/test_radixtree'
relocatable = true

View File

@ -3736,6 +3736,7 @@ rewind_source
rewrite_event
rf_context
rm_detail_t
rt_node_class_test_elem
role_auth_extra
rolename_hash
row_security_policy_hook_type