/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
                                  Buffer metabuf, Buffer buf);

/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPage usedmetap = NULL;
    Page        metapage;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    OffsetNumber itup_off;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleSize(itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

restart_insert:
|
Cache hash index's metapage in rel->rd_amcache.
This avoids a very significant amount of buffer manager traffic and
contention when scanning hash indexes, because it's no longer
necessary to lock and pin the metapage for every scan. We do need
some way of figuring out when the cache is too stale to use any more,
so that when we lock the primary bucket page to which the cached
metapage points us, we can tell whether a split has occurred since we
cached the metapage data. To do that, we use the hash_prevblkno field
in the primary bucket page, which would otherwise always be set to
InvalidBuffer.
This patch contains code so that it will continue working (although
less efficiently) with hash indexes built before this change, but
perhaps we should consider bumping the hash version and ripping out
the compatibility code. That decision can be made later, though.
Mithun Cy, reviewed by Jesper Pedersen, Amit Kapila, and by me.
Before committing, I made a number of cosmetic changes to the last
posted version of the patch, adjusted _hash_getcachedmetap to be more
careful about order of operation, and made some necessary updates to
the pageinspect documentation and regression tests.
2017-02-07 18:24:25 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Read the metapage. We don't lock it yet; HashMaxItemSize() will
|
2017-05-17 22:31:56 +02:00
|
|
|
* examine pd_pagesize_version, but that can't change so we can examine it
|
|
|
|
* without a lock.
|
Cache hash index's metapage in rel->rd_amcache.
This avoids a very significant amount of buffer manager traffic and
contention when scanning hash indexes, because it's no longer
necessary to lock and pin the metapage for every scan. We do need
some way of figuring out when the cache is too stale to use any more,
so that when we lock the primary bucket page to which the cached
metapage points us, we can tell whether a split has occurred since we
cached the metapage data. To do that, we use the hash_prevblkno field
in the primary bucket page, which would otherwise always be set to
InvalidBuffer.
This patch contains code so that it will continue working (although
less efficiently) with hash indexes built before this change, but
perhaps we should consider bumping the hash version and ripping out
the compatibility code. That decision can be made later, though.
Mithun Cy, reviewed by Jesper Pedersen, Amit Kapila, and by me.
Before committing, I made a number of cosmetic changes to the last
posted version of the patch, adjusted _hash_getcachedmetap to be more
careful about order of operation, and made some necessary updates to
the pageinspect documentation and regression tests.
2017-02-07 18:24:25 +01:00
|
|
|
*/
|
|
|
|
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
|
2016-12-22 19:54:40 +01:00
|
|
|
metapage = BufferGetPage(metabuf);

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize(metapage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize(metapage)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
                                          &usedmetap);
    Assert(usedmetap != NULL);

    CheckForSerializableConflictIn(rel, NULL, buf);
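
    /*
     * The CheckForSerializableConflictIn() call above flags a read/write
     * conflict (and may raise a serialization failure) if a concurrent
     * serializable transaction holds a predicate lock on this bucket's
     * primary page, which is the granularity at which hash indexes take
     * predicate locks.
     */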

    /* remember the primary bucket buffer to release the pin on it at end. */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = pageopaque->hasho_bucket;

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        BlockNumber nextblkno;

        /*
         * Check if current page has any DEAD tuples. If yes, delete these
         * tuples and see if we can get a space for the new item to be
         * inserted before moving to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque))
        {
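            /*
             * Removing dead tuples requires a buffer cleanup lock, i.e. an
             * exclusive lock while holding the only pin on the page, so that
             * no concurrent scan can still be holding a position within it.
             */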
            if (IsBufferCleanupOK(buf))
            {
                _hash_vacuum_one_page(rel, heapRel, metabuf, buf);

                if (PageGetFreeSpace(page) >= itemsz)
                    break;      /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the
             * pin on the primary bucket must be retained throughout the
             * scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
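            /*
             * (the last argument tells _hash_addovflpage whether to retain
             * our pin; true when 'buf' is the primary bucket page)
             */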
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update.  No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
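    /* (i.e., split once average tuples per bucket exceed hashm_ffactor) */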
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
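
        /*
         * Register the metapage as block 1 and the page that received the
         * new tuple as block 0; the tuple itself is attached as block 0
         * data so that redo can re-insert it.
         */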
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the modified page and ensure to release the pin on primary
     * page.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 *	_hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call this function without pin and write lock
 * on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /* Find where to insert the tuple (preserving page's hashkey ordering) */
    hashkey = _hash_get_indextuple_hashkey(itup);
    itup_off = _hash_binsearch(page, hashkey);

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}

/*
 *	_hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *							 index.
 *
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * itup_offsets array.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
                    OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;
    int         i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++)
    {
        Size        itemsize;

        itemsize = IndexTupleSize(itups[i]);
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
            == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"",
                 RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page. We must acquire cleanup
 * lock on the page being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    OffsetNumber offnum,
                maxoff;
    Page        page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple in page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        TransactionId latestRemovedXid;
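
        /*
         * Ask the table AM for the latest removed XID among the heap tuples
         * our dead index entries reference; it is carried in the WAL record
         * so that hot-standby replay can resolve recovery conflicts.
         */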
        latestRemovedXid =
            index_compute_xid_horizon_for_tuples(rel, hrel, buf,
                                                 deletable, ndeletable);

        /*
         * Write-lock the meta page so that we can decrement the tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items. This is not
         * certainly true (there might be some that have recently been
         * marked, but weren't included in our target-item list), but it
         * will almost always be true and it doesn't seem worth an
         * additional page scan to check it. Remember that
         * LH_PAGE_HAS_DEAD_TUPLES is only a hint anyway.
         */
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr  recptr;

            xlrec.latestRemovedXid = latestRemovedXid;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the latestRemovedXid on a
             * standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Release the write lock on the meta page now that we have updated
         * the tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}