Add selectivity estimation functions for intarray operators.

Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup
by me.
This commit is contained in:
Heikki Linnakangas 2015-07-21 20:54:18 +03:00
parent 434873806a
commit c6fbe6d6fb
7 changed files with 443 additions and 17 deletions

View File

@ -2,10 +2,10 @@
MODULE_big = _int
OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \
_intbig_gist.o _int_gin.o $(WIN32RES)
_intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES)
EXTENSION = intarray
DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql
DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql
PGFILEDESC = "intarray - functions and operators for arrays of integers"
REGRESS = _int

View File

@ -0,0 +1,341 @@
/*-------------------------------------------------------------------------
*
* _int_selfuncs.c
* Functions for selectivity estimation of intarray operators
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* contrib/intarray/_int_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "_int.h"
#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"
#include "utils/lsyscache.h"
#include "miscadmin.h"
PG_FUNCTION_INFO_V1(_int_overlap_sel);
PG_FUNCTION_INFO_V1(_int_contains_sel);
PG_FUNCTION_INFO_V1(_int_contained_sel);
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
PG_FUNCTION_INFO_V1(_int_matchsel);
Datum _int_overlap_sel(PG_FUNCTION_ARGS);
Datum _int_contains_sel(PG_FUNCTION_ARGS);
Datum _int_contained_sel(PG_FUNCTION_ARGS);
Datum _int_overlap_joinsel(PG_FUNCTION_ARGS);
Datum _int_contains_joinsel(PG_FUNCTION_ARGS);
Datum _int_contained_joinsel(PG_FUNCTION_ARGS);
Datum _int_matchsel(PG_FUNCTION_ARGS);
static Selectivity int_query_opr_selec(ITEM *item, Datum *values, float4 *freqs,
int nmncelems, float4 minfreq);
static int compare_val_int4(const void *a, const void *b);
/*
* Wrappers around the default array selectivity estimation functions.
*
* The default array selectivity operators for the @>, && and @< operators
* work fine for integer arrays. However, if we tried to just use arraycontsel
* and arracontjoinsel directly as the cost estimator functions for our
* operators, they would not work as intended, because they look at the
* operator's OID. Our operators behave exactly like the built-in anyarray
* versions, but we must tell the cost estimator functions which built-in
* operators they correspond to. These wrappers just replace the operator
* OID with the corresponding built-in operator's OID, and call the built-in
* function.
*/
Datum
_int_overlap_sel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3)));
}
Datum
_int_contains_sel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3)));
}
Datum
_int_contained_sel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3)));
}
Datum
_int_overlap_joinsel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3),
PG_GETARG_DATUM(4)));
}
Datum
_int_contains_joinsel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3),
PG_GETARG_DATUM(4)));
}
Datum
_int_contained_joinsel(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
PG_GETARG_DATUM(0),
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
PG_GETARG_DATUM(2),
PG_GETARG_DATUM(3),
PG_GETARG_DATUM(4)));
}
/*
* _int_matchsel -- restriction selectivity function for intarray @@ query_int
*/
Datum
_int_matchsel(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
VariableStatData vardata;
Node *other;
bool varonleft;
Selectivity selec;
QUERYTYPE *query;
Datum *mcelems = NULL;
float4 *mcefreqs = NULL;
int nmcelems = 0;
float4 minfreq = 0.0;
float4 nullfrac = 0.0;
Form_pg_statistic stats;
Datum *values = NULL;
int nvalues = 0;
float4 *numbers = NULL;
int nnumbers = 0;
/*
* If expression is not "variable @@ something" or "something @@ variable"
* then punt and return a default estimate.
*/
if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft))
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
/*
* Variable should be int[]. We don't support cases where variable is
* query_int.
*/
if (vardata.vartype != INT4ARRAYOID)
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
/*
* Can't do anything useful if the something is not a constant, either.
*/
if (!IsA(other, Const))
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
}
/*
* The "@@" operator is strict, so we can cope with NULL right away.
*/
if (((Const *) other)->constisnull)
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(0.0);
}
/* The caller made sure the const is a query, so get it now */
query = DatumGetQueryTypeP(((Const *) other)->constvalue);
/* Empty query matches nothing */
if (query->size == 0)
{
ReleaseVariableStats(vardata);
return (Selectivity) 0.0;
}
/*
* Get the statistics for the intarray column.
*
* We're interested in the Most-Common-Elements list, and the NULL
* fraction.
*/
if (HeapTupleIsValid(vardata.statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
nullfrac = stats->stanullfrac;
/*
* For an int4 array, the default array type analyze function will
* collect a Most Common Elements list, which is an array of int4s.
*/
if (get_attstatsslot(vardata.statsTuple,
INT4OID, -1,
STATISTIC_KIND_MCELEM, InvalidOid,
NULL,
&values, &nvalues,
&numbers, &nnumbers))
{
/*
* There should be three more Numbers than Values, because the
* last three (for intarray) cells are taken for minimal, maximal
* and nulls frequency. Punt if not.
*/
if (nnumbers == nvalues + 3)
{
/* Grab the lowest frequency. */
minfreq = numbers[nnumbers - (nnumbers - nvalues)];
mcelems = values;
mcefreqs = numbers;
nmcelems = nvalues;
}
}
}
/* Process the logical expression in the query, using the stats */
selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
mcelems, mcefreqs, nmcelems, minfreq);
/* MCE stats count only non-null rows, so adjust for null rows. */
selec *= (1.0 - nullfrac);
free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers);
ReleaseVariableStats(vardata);
CLAMP_PROBABILITY(selec);
PG_RETURN_FLOAT8((float8) selec);
}
/*
* Estimate selectivity of single intquery operator
*/
static Selectivity
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
int nmcelems, float4 minfreq)
{
Selectivity selec;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
if (item->type == VAL)
{
Datum *searchres;
if (mcelems == NULL)
return (Selectivity) DEFAULT_EQ_SEL;
searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
sizeof(Datum), compare_val_int4);
if (searchres)
{
/*
* The element is in MCELEM. Return precise selectivity (or at
* least as precise as ANALYZE could find out).
*/
selec = mcefreqs[searchres - mcelems];
}
else
{
/*
* The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2.
*/
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
}
}
else if (item->type == OPR)
{
/* Current query node is an operator */
Selectivity s1,
s2;
s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
minfreq);
switch (item->val)
{
case (int32) '!':
selec = 1.0 - s1;
break;
case (int32) '&':
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
nmcelems, minfreq);
selec = s1 * s2;
break;
case (int32) '|':
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
nmcelems, minfreq);
selec = s1 + s2 - s1 * s2;
break;
default:
elog(ERROR, "unrecognized operator: %d", item->val);
selec = 0; /* keep compiler quiet */
break;
}
}
else
{
elog(ERROR, "unrecognized int query item type: %u", item->type);
selec = 0; /* keep compiler quiet */
}
/* Clamp intermediate results to stay sane despite roundoff error */
CLAMP_PROBABILITY(selec);
return selec;
}
/*
* Comparison function for binary search in mcelem array.
*/
static int
compare_val_int4(const void *a, const void *b)
{
int32 key = *(int32 *) a;
const Datum *t = (const Datum *) b;
return key - DatumGetInt32(*t);
}

View File

@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
ANALYZE test__int;
SELECT count(*) from test__int WHERE a && '{23,50}';
count
-------

View File

@ -0,0 +1,49 @@
/* contrib/intarray/intarray--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION intarray UPDATE TO '1.1'" to load this file. \quit
CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
ALTER OPERATOR @@ (_int4, query_int) SET (RESTRICT = _int_matchsel);
ALTER OPERATOR ~~ (query_int, _int4) SET (RESTRICT = _int_matchsel);
CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
ALTER OPERATOR && (_int4, _int4) SET (RESTRICT = _int_overlap_sel, JOIN = _int_overlap_joinsel);
ALTER OPERATOR @> (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
ALTER OPERATOR <@ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
ALTER OPERATOR @ (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
ALTER OPERATOR ~ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);

View File

@ -1,4 +1,4 @@
/* contrib/intarray/intarray--1.0.sql */
/* contrib/intarray/intarray--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION intarray" to load this file. \quit
@ -45,12 +45,17 @@ LANGUAGE C STRICT IMMUTABLE;
COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array';
CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE OPERATOR @@ (
LEFTARG = _int4,
RIGHTARG = query_int,
PROCEDURE = boolop,
COMMUTATOR = '~~',
RESTRICT = contsel,
RESTRICT = _int_matchsel,
JOIN = contjoinsel
);
@ -59,7 +64,7 @@ CREATE OPERATOR ~~ (
RIGHTARG = _int4,
PROCEDURE = rboolop,
COMMUTATOR = '@@',
RESTRICT = contsel,
RESTRICT = _int_matchsel,
JOIN = contjoinsel
);
@ -117,6 +122,36 @@ RETURNS _int4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
--
-- OPERATORS
--
@ -126,8 +161,8 @@ CREATE OPERATOR && (
RIGHTARG = _int4,
PROCEDURE = _int_overlap,
COMMUTATOR = '&&',
RESTRICT = contsel,
JOIN = contjoinsel
RESTRICT = _int_overlap_sel,
JOIN = _int_overlap_joinsel
);
--CREATE OPERATOR = (
@ -157,8 +192,8 @@ CREATE OPERATOR @> (
RIGHTARG = _int4,
PROCEDURE = _int_contains,
COMMUTATOR = '<@',
RESTRICT = contsel,
JOIN = contjoinsel
RESTRICT = _int_contains_sel,
JOIN = _int_contains_joinsel
);
CREATE OPERATOR <@ (
@ -166,8 +201,8 @@ CREATE OPERATOR <@ (
RIGHTARG = _int4,
PROCEDURE = _int_contained,
COMMUTATOR = '@>',
RESTRICT = contsel,
JOIN = contjoinsel
RESTRICT = _int_contained_sel,
JOIN = _int_contained_joinsel
);
-- obsolete:
@ -176,8 +211,8 @@ CREATE OPERATOR @ (
RIGHTARG = _int4,
PROCEDURE = _int_contains,
COMMUTATOR = '~',
RESTRICT = contsel,
JOIN = contjoinsel
RESTRICT = _int_contains_sel,
JOIN = _int_contains_joinsel
);
CREATE OPERATOR ~ (
@ -185,8 +220,8 @@ CREATE OPERATOR ~ (
RIGHTARG = _int4,
PROCEDURE = _int_contained,
COMMUTATOR = '@',
RESTRICT = contsel,
JOIN = contjoinsel
RESTRICT = _int_contained_sel,
JOIN = _int_contained_joinsel
);
--------------

View File

@ -1,5 +1,5 @@
# intarray extension
comment = 'functions, operators, and index support for 1-D arrays of integers'
default_version = '1.0'
default_version = '1.1'
module_pathname = '$libdir/_int'
relocatable = true

View File

@ -68,8 +68,8 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
ANALYZE test__int;
SELECT count(*) from test__int WHERE a && '{23,50}';
SELECT count(*) from test__int WHERE a @@ '23|50';