Add IntegerSet, to hold large sets of 64-bit ints efficiently.

The set is implemented as a B-tree, with a compact representation at leaf
items, using Simple-8b algorithm, so that clusters of nearby values use
less memory.

The IntegerSet isn't used for anything yet, aside from the test code, but
we have two patches in the works that would benefit from this: A patch to
allow GiST vacuum to delete empty pages, and a patch to reduce heap
VACUUM's memory usage, by storing the list of dead TIDs more efficiently
and lifting the 1 GB limit on its size.

This includes a unit test module, in src/test/modules/test_integerset.
It can be used to verify correctness, as a regression test, but if you run
it manully, it can also print memory usage and execution time of some of
the tests.

Author: Heikki Linnakangas, Andrey Borodin
Reviewed-by: Julien Rouhaud
Discussion: https://www.postgresql.org/message-id/b5e82599-1966-5783-733c-1a947ddb729f@iki.fi
This commit is contained in:
Heikki Linnakangas 2019-03-22 13:21:19 +02:00
parent 5e1963fb76
commit df816f6ad5
13 changed files with 1738 additions and 1 deletions

View File

@ -13,6 +13,6 @@ top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = binaryheap.o bipartite_match.o bloomfilter.o dshash.o hyperloglog.o \
ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
ilist.o integerset.o knapsack.o pairingheap.o rbtree.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk

View File

@ -13,6 +13,8 @@ hyperloglog.c - a streaming cardinality estimator
ilist.c - single and double-linked lists
integerset.c - a data structure for holding large set of integers
knapsack.c - knapsack problem solver
pairingheap.c - a pairing heap

1019
src/backend/lib/integerset.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
/*
* integerset.h
* In-memory data structure to hold a large set of integers efficiently
*
* Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group
*
* src/include/lib/integerset.h
*/
#ifndef INTEGERSET_H
#define INTEGERSET_H
typedef struct IntegerSet IntegerSet;
extern IntegerSet *intset_create(void);
extern void intset_add_member(IntegerSet *intset, uint64 x);
extern bool intset_is_member(IntegerSet *intset, uint64 x);
extern uint64 intset_num_entries(IntegerSet *intset);
extern uint64 intset_memory_usage(IntegerSet *intset);
extern void intset_begin_iterate(IntegerSet *intset);
extern bool intset_iterate_next(IntegerSet *intset, uint64 *next);
#endif /* INTEGERSET_H */

View File

@ -12,6 +12,7 @@ SUBDIRS = \
test_bloomfilter \
test_ddl_deparse \
test_extensions \
test_integerset \
test_parser \
test_pg_dump \
test_predtest \

View File

@ -0,0 +1,4 @@
# Generated subdirectories
/log/
/results/
/tmp_check/

View File

@ -0,0 +1,21 @@
# src/test/modules/test_integerset/Makefile
MODULE_big = test_integerset
OBJS = test_integerset.o $(WIN32RES)
PGFILEDESC = "test_integerset - test code for src/backend/lib/integerset.c"
EXTENSION = test_integerset
DATA = test_integerset--1.0.sql
REGRESS = test_integerset
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = src/test/modules/test_integerset
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

View File

@ -0,0 +1,7 @@
test_integerset contains unit tests for testing the integer set implementation,
in src/backend/lib/integerset.c
The tests verify the correctness of the implemention, but they can also be
as a micro-benchmark: If you set the 'intset_tests_stats' flag in
test_integerset.c, the tests will print extra information about execution time
and memory usage.

View File

@ -0,0 +1,14 @@
CREATE EXTENSION test_integerset;
--
-- These tests don't produce any interesting output. We're checking that
-- the operations complete without crashing or hanging and that none of their
-- internal sanity tests fail. They print progress information as INFOs,
-- which are not interesting for automated tests, so suppress those.
--
SET client_min_messages = 'warning';
SELECT test_integerset();
test_integerset
-----------------
(1 row)

View File

@ -0,0 +1,11 @@
CREATE EXTENSION test_integerset;
--
-- These tests don't produce any interesting output. We're checking that
-- the operations complete without crashing or hanging and that none of their
-- internal sanity tests fail. They print progress information as INFOs,
-- which are not interesting for automated tests, so suppress those.
--
SET client_min_messages = 'warning';
SELECT test_integerset();

View File

@ -0,0 +1,8 @@
/* src/test/modules/test_integerset/test_integerset--1.0.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION test_integerset" to load this file. \quit
CREATE FUNCTION test_integerset()
RETURNS pg_catalog.void STRICT
AS 'MODULE_PATHNAME' LANGUAGE C;

View File

@ -0,0 +1,622 @@
/*--------------------------------------------------------------------------
*
* test_integerset.c
* Test integer set data structure.
*
* Copyright (c) 2019, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/test/modules/test_integerset/test_integerset.c
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "lib/integerset.h"
#include "nodes/bitmapset.h"
#include "utils/memutils.h"
#include "utils/timestamp.h"
#include "storage/block.h"
#include "storage/itemptr.h"
#include "miscadmin.h"
/*
* If you enable this, the "pattern" tests will print information about
* how long populating, probing, and iterating the test set takes, and
* how much memory the test set consumed. That can be used as
* micro-benchmark of various operations and input patterns (you might
* want to increase the number of values used in each of the test, if
* you do that, to reduce noise)
*
* The information is printed to the server's stderr, mostly because
* that's where MemoryContextStats() output goes.
*/
static const bool intset_test_stats = false;
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(test_integerset);
/*
* A struct to define a pattern of integers, for use with test_pattern()
* function.
*/
typedef struct
{
char *test_name; /* short name of the test, for humans */
char *pattern_str; /* a bit pattern */
uint64 spacing; /* pattern repeats at this interval */
uint64 num_values; /* number of integers to set in total */
} test_spec;
static const test_spec test_specs[] = {
{
"all ones", "1111111111",
10, 10000000
},
{
"alternating bits", "0101010101",
10, 10000000
},
{
"clusters of ten", "1111111111",
10000, 10000000
},
{
"clusters of hundred",
"1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111",
10000, 100000000
},
{
"one-every-64k", "1",
65536, 10000000
},
{
"sparse", "100000000000000000000000000000001",
10000000, 10000000
},
{
"single values, distance > 2^32", "1",
10000000000L, 1000000
},
{
"clusters, distance > 2^32", "10101010",
10000000000L, 10000000
},
{
"clusters, distance > 2^60", "10101010",
2000000000000000000L,
23 /* can't be much higher than this, or we
* overflow uint64 */
}
};
static void test_pattern(const test_spec *spec);
static void test_empty(void);
static void test_single_value(uint64 value);
static void check_with_filler(IntegerSet *intset, uint64 x, uint64 value, uint64 filler_min, uint64 filler_max);
static void test_single_value_and_filler(uint64 value, uint64 filler_min, uint64 filler_max);
static void test_huge_distances(void);
/*
* SQL-callable entry point to perform all tests.
*/
Datum
test_integerset(PG_FUNCTION_ARGS)
{
MemoryContext test_ctx;
test_ctx = AllocSetContextCreate(CurrentMemoryContext,
"test_integerset context",
ALLOCSET_DEFAULT_SIZES);
/* Tests for various corner cases */
test_empty();
test_huge_distances();
test_single_value(0);
test_single_value(1);
test_single_value(PG_UINT64_MAX - 1);
test_single_value(PG_UINT64_MAX);
test_single_value_and_filler(0, 1000, 2000);
test_single_value_and_filler(1, 1000, 2000);
test_single_value_and_filler(1, 1000, 2000000);
test_single_value_and_filler(PG_UINT64_MAX - 1, 1000, 2000);
test_single_value_and_filler(PG_UINT64_MAX, 1000, 2000);
/* Test different test patterns, with lots of entries */
for (int i = 0; i < lengthof(test_specs); i++)
{
MemoryContextReset(test_ctx);
test_pattern(&test_specs[i]);
}
MemoryContextDelete(test_ctx);
PG_RETURN_VOID();
}
/*
* Test with a repeating pattern, defined by the 'spec'.
*/
static void
test_pattern(const test_spec *spec)
{
IntegerSet *intset;
MemoryContext intset_ctx;
MemoryContext old_ctx;
TimestampTz starttime;
TimestampTz endtime;
uint64 n;
uint64 last_int;
int patternlen;
uint64 *pattern_values;
uint64 pattern_num_values;
elog(NOTICE, "testing intset with pattern \"%s\"", spec->test_name);
if (intset_test_stats)
fprintf(stderr, "-----\ntesting intset with pattern \"%s\"\n", spec->test_name);
/* Pre-process the pattern, creating an array of integers from it. */
patternlen = strlen(spec->pattern_str);
pattern_values = palloc(patternlen * sizeof(uint64));
pattern_num_values = 0;
for (int i = 0; i < patternlen; i++)
{
if (spec->pattern_str[i] == '1')
pattern_values[pattern_num_values++] = i;
}
/*
* Allocate the integer set.
*
* Allocate it in a separate memory context, so that we can print its
* memory usage easily. (intset_create() creates a memory context of its
* own, too, but we don't have direct access to it, so we cannot call
* MemoryContextStats() on it directly).
*/
intset_ctx = AllocSetContextCreate(CurrentMemoryContext,
"intset test",
ALLOCSET_SMALL_SIZES);
MemoryContextSetIdentifier(intset_ctx, spec->test_name);
old_ctx = MemoryContextSwitchTo(intset_ctx);
intset = intset_create();
MemoryContextSwitchTo(old_ctx);
/*
* Add values to the set.
*/
starttime = GetCurrentTimestamp();
n = 0;
last_int = 0;
while (n < spec->num_values)
{
uint64 x = 0;
for (int i = 0; i < pattern_num_values && n < spec->num_values; i++)
{
x = last_int + pattern_values[i];
intset_add_member(intset, x);
n++;
}
last_int += spec->spacing;
}
endtime = GetCurrentTimestamp();
if (intset_test_stats)
fprintf(stderr, "added %lu values in %lu ms\n",
spec->num_values, (endtime - starttime) / 1000);
/*
* Print stats on the amount of memory used.
*
* We print the usage reported by intset_memory_usage(), as well as the
* stats from the memory context. They should be in the same ballpark,
* but it's hard to automate testing that, so if you're making changes to
* the implementation, just observe that manually.
*/
if (intset_test_stats)
{
uint64 mem_usage;
/*
* Also print memory usage as reported by intset_memory_usage(). It
* should be in the same ballpark as the usage reported by
* MemoryContextStats().
*/
mem_usage = intset_memory_usage(intset);
fprintf(stderr, "intset_memory_usage() reported %lu (%0.2f bytes / integer)\n",
mem_usage, (double) mem_usage / spec->num_values);
MemoryContextStats(intset_ctx);
}
/* Check that intset_get_num_entries works */
n = intset_num_entries(intset);
if (n != spec->num_values)
elog(ERROR, "intset_num_entries returned %lu, expected %lu", n, spec->num_values);
/*
* Test random-access probes with intset_is_member()
*/
starttime = GetCurrentTimestamp();
for (n = 0; n < 100000; n++)
{
bool b;
bool expected;
uint64 x;
/*
* Pick next value to probe at random. We limit the probes to the
* last integer that we added to the set, plus an arbitrary constant
* (1000). There's no point in probing the whole 0 - 2^64 range, if
* only a small part of the integer space is used. We would very
* rarely hit hit values that are actually in the set.
*/
x = (pg_lrand48() << 31) | pg_lrand48();
x = x % (last_int + 1000);
/* Do we expect this value to be present in the set? */
if (x >= last_int)
expected = false;
else
{
uint64 idx = x % spec->spacing;
if (idx >= patternlen)
expected = false;
else if (spec->pattern_str[idx] == '1')
expected = true;
else
expected = false;
}
/* Is it present according to intset_is_member() ? */
b = intset_is_member(intset, x);
if (b != expected)
elog(ERROR, "mismatch at %lu: %d vs %d", x, b, expected);
}
endtime = GetCurrentTimestamp();
if (intset_test_stats)
fprintf(stderr, "probed %lu values in %lu ms\n", n, (endtime - starttime) / 1000);
/*
* Test iterator
*/
starttime = GetCurrentTimestamp();
intset_begin_iterate(intset);
n = 0;
last_int = 0;
while (n < spec->num_values)
{
for (int i = 0; i < pattern_num_values && n < spec->num_values; i++)
{
uint64 expected = last_int + pattern_values[i];
uint64 x;
if (!intset_iterate_next(intset, &x))
break;
if (x != expected)
elog(ERROR, "iterate returned wrong value; got %lu, expected %lu", x, expected);
n++;
}
last_int += spec->spacing;
}
endtime = GetCurrentTimestamp();
if (intset_test_stats)
fprintf(stderr, "iterated %lu values in %lu ms\n", n, (endtime - starttime) / 1000);
if (n < spec->num_values)
elog(ERROR, "iterator stopped short after %lu entries, expected %lu", n, spec->num_values);
if (n > spec->num_values)
elog(ERROR, "iterator returned %lu entries, %lu was expected", n, spec->num_values);
MemoryContextDelete(intset_ctx);
}
/*
* Test with a set containing a single integer.
*/
static void
test_single_value(uint64 value)
{
IntegerSet *intset;
uint64 x;
uint64 num_entries;
bool found;
elog(NOTICE, "testing intset with single value %lu", value);
/* Create the set. */
intset = intset_create();
intset_add_member(intset, value);
/* Test intset_get_num_entries() */
num_entries = intset_num_entries(intset);
if (num_entries != 1)
elog(ERROR, "intset_num_entries returned %lu, expected %lu", num_entries, 1L);
/*
* Test intset_is_member() at various special values, like 0 and and
* maximum possible 64-bit integer, as well as the value itself.
*/
if (intset_is_member(intset, 0) != (value == 0))
elog(ERROR, "intset_is_member failed for 0");
if (intset_is_member(intset, 1) != (value == 1))
elog(ERROR, "intset_is_member failed for 1");
if (intset_is_member(intset, PG_UINT64_MAX) != (value == PG_UINT64_MAX))
elog(ERROR, "intset_is_member failed for PG_UINT64_MAX");
if (intset_is_member(intset, value) != true)
elog(ERROR, "intset_is_member failed for the tested value");
/*
* Test iterator
*/
intset_begin_iterate(intset);
found = intset_iterate_next(intset, &x);
if (!found || x != value)
elog(ERROR, "intset_iterate_next failed for %lu", x);
found = intset_iterate_next(intset, &x);
if (found)
elog(ERROR, "intset_iterate_next failed %lu", x);
}
/*
* Test with an integer set that contains:
*
* - a given single 'value', and
* - all integers between 'filler_min' and 'filler_max'.
*
* This exercises different codepaths than testing just with a single value,
* because the implementation buffers newly-added values. If we add just
* single value to the set, we won't test the internal B-tree code at all,
* just the code that deals with the buffer.
*/
static void
test_single_value_and_filler(uint64 value, uint64 filler_min, uint64 filler_max)
{
IntegerSet *intset;
uint64 x;
bool found;
uint64 *iter_expected;
uint64 n = 0;
uint64 num_entries = 0;
uint64 mem_usage;
elog(NOTICE, "testing intset with value %lu, and all between %lu and %lu",
value, filler_min, filler_max);
intset = intset_create();
iter_expected = palloc(sizeof(uint64) * (filler_max - filler_min + 1));
if (value < filler_min)
{
intset_add_member(intset, value);
iter_expected[n++] = value;
}
for (x = filler_min; x < filler_max; x++)
{
intset_add_member(intset, x);
iter_expected[n++] = x;
}
if (value >= filler_max)
{
intset_add_member(intset, value);
iter_expected[n++] = value;
}
/* Test intset_get_num_entries() */
num_entries = intset_num_entries(intset);
if (num_entries != n)
elog(ERROR, "intset_num_entries returned %lu, expected %lu", num_entries, n);
/*
* Test intset_is_member() at various spots, at and around the values that
* we expect to be set, as well as 0 and the maximum possible value.
*/
check_with_filler(intset, 0,
value, filler_min, filler_max);
check_with_filler(intset, 1,
value, filler_min, filler_max);
check_with_filler(intset, filler_min - 1,
value, filler_min, filler_max);
check_with_filler(intset, filler_min,
value, filler_min, filler_max);
check_with_filler(intset, filler_min + 1,
value, filler_min, filler_max);
check_with_filler(intset, value - 1,
value, filler_min, filler_max);
check_with_filler(intset, value,
value, filler_min, filler_max);
check_with_filler(intset, value + 1,
value, filler_min, filler_max);
check_with_filler(intset, filler_max - 1,
value, filler_min, filler_max);
check_with_filler(intset, filler_max,
value, filler_min, filler_max);
check_with_filler(intset, filler_max + 1,
value, filler_min, filler_max);
check_with_filler(intset, PG_UINT64_MAX - 1,
value, filler_min, filler_max);
check_with_filler(intset, PG_UINT64_MAX,
value, filler_min, filler_max);
intset_begin_iterate(intset);
for (uint64 i = 0; i < n; i++)
{
found = intset_iterate_next(intset, &x);
if (!found || x != iter_expected[i])
elog(ERROR, "intset_iterate_next failed for %lu", x);
}
found = intset_iterate_next(intset, &x);
if (found)
elog(ERROR, "intset_iterate_next failed %lu", x);
mem_usage = intset_memory_usage(intset);
if (mem_usage < 5000 || mem_usage > 500000000)
elog(ERROR, "intset_memory_usage() reported suspicous value: %lu", mem_usage);
}
/*
* Helper function for test_single_value_and_filler.
*
* Calls intset_is_member() for value 'x', and checks that the result is what
* we expect.
*/
static void
check_with_filler(IntegerSet *intset, uint64 x,
uint64 value, uint64 filler_min, uint64 filler_max)
{
bool expected;
bool actual;
expected = (x == value || (filler_min <= x && x < filler_max));
actual = intset_is_member(intset, x);
if (actual != expected)
elog(ERROR, "intset_is_member failed for %lu", x);
}
/*
* Test empty set
*/
static void
test_empty(void)
{
IntegerSet *intset;
uint64 x;
elog(NOTICE, "testing intset with empty set");
intset = intset_create();
/* Test intset_is_member() */
if (intset_is_member(intset, 0) != false)
elog(ERROR, "intset_is_member on empty set returned true");
if (intset_is_member(intset, 1) != false)
elog(ERROR, "intset_is_member on empty set returned true");
if (intset_is_member(intset, PG_UINT64_MAX) != false)
elog(ERROR, "intset_is_member on empty set returned true");
/* Test iterator */
intset_begin_iterate(intset);
if (intset_iterate_next(intset, &x))
elog(ERROR, "intset_iterate_next on empty set returned a value (%lu)", x);
}
/*
* Test with integers that are more than 2^60 apart.
*
* The Simple-8b encoding used by the set implementation can only encode
* values up to 2^60. That makes large differences like this interesting
* to test.
*/
static void
test_huge_distances(void)
{
IntegerSet *intset;
uint64 values[1000];
int num_values = 0;
uint64 val = 0;
bool found;
uint64 x;
elog(NOTICE, "testing intset with distances > 2^60 between values");
val = 0;
values[num_values++] = val;
val += 1152921504606846976L - 1; /* 2^60 - 1 */
values[num_values++] = val;
val += 1152921504606846976L - 1; /* 2^60 - 1 */
values[num_values++] = val;
val += 1152921504606846976L; /* 2^60 */
values[num_values++] = val;
val += 1152921504606846976L; /* 2^60 */
values[num_values++] = val;
val += 1152921504606846976L; /* 2^60 */
values[num_values++] = val;
val += 1152921504606846976L + 1; /* 2^60 + 1 */
values[num_values++] = val;
val += 1152921504606846976L + 1; /* 2^60 + 1 */
values[num_values++] = val;
val += 1152921504606846976L + 1; /* 2^60 + 1 */
values[num_values++] = val;
val += 1152921504606846976L; /* 2^60 */
values[num_values++] = val;
/* we're now very close to 2^64, so can't add large values anymore */
intset = intset_create();
/*
* Add many more values to the end, to make sure that all the above values
* get flushed and packed into the tree structure.
*/
while (num_values < 1000)
{
val += pg_lrand48();
values[num_values++] = val;
}
/* Add these numbers to the set */
for (int i = 0; i < num_values; i++)
intset_add_member(intset, values[i]);
/*
* Test iterset_is_member() around each of these values
*/
for (int i = 0; i < num_values; i++)
{
uint64 x = values[i];
bool result;
if (x > 0)
{
result = intset_is_member(intset, x - 1);
if (result != false)
elog(ERROR, "intset_is_member failed for %lu", x - 1);
}
result = intset_is_member(intset, x);
if (result != true)
elog(ERROR, "intset_is_member failed for %lu", x);
result = intset_is_member(intset, x + 1);
if (result != false)
elog(ERROR, "intset_is_member failed for %lu", x + 1);
}
/*
* Test iterator
*/
intset_begin_iterate(intset);
for (int i = 0; i < num_values; i++)
{
found = intset_iterate_next(intset, &x);
if (!found || x != values[i])
elog(ERROR, "intset_iterate_next failed for %lu", x);
}
found = intset_iterate_next(intset, &x);
if (found)
elog(ERROR, "intset_iterate_next failed %lu", x);
}

View File

@ -0,0 +1,4 @@
comment = 'Test code for integerset'
default_version = '1.0'
module_pathname = '$libdir/test_integerset'
relocatable = true