Improve performance of Unicode {de,re}composition in the backend

This replaces the existing binary search with two perfect hash
functions, one for composition and one for decomposition, in the
backend code, at the cost of slightly larger binaries there (35kB in
libpgcommon_srv.a).  Per the measurements done, this improves the speed
of recomposition and decomposition by up to 30-40 times for the NFC and
NFKC conversions, while all other operations get at least 40% faster.
This is not as good as what libicu offers, but it closes the gap
considerably, as per the feedback from Daniel Verite.
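
A perfect hash needs no collision handling: hash the key, check that
the result index is in range, then verify that the slot really holds
the key, since codepoints outside the table hash to arbitrary slots.
Below is a condensed sketch of the backend decomposition lookup,
matching get_code_entry() in the unicode_norm.c diff further down (the
helper name is illustrative; types and tables come from the generated
unicode_norm_hashfunc.h):

    static const pg_unicode_decomposition *
    lookup_decomposition(pg_wchar code)
    {
        pg_unicode_decompinfo info = UnicodeDecompInfo;
        uint32      hashkey = pg_hton32(code);  /* key bytes in network order */
        int         h = info.hash(&hashkey);

        /* Out-of-range index or mismatched slot: no entry for this code. */
        if (h < 0 || h >= info.num_decomps ||
            info.decomps[h].codepoint != code)
            return NULL;
        return &info.decomps[h];
    }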

The decomposition table remains the same and is still used for the
binary search in the frontend code, where we care more about the size
of libraries like libpq than about raw performance, as this code is
involved only in code paths related to SCRAM authentication.  As a
consequence, the perfect hash function for recomposition needs a new
inverse lookup array pointing back to the existing decomposition table.
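
Concretely, the recomposition path packs the code pair into one 64-bit
key, hashes it, and uses the result to index the inverse lookup array,
which points back into UnicodeDecompMain[]; the candidate entry is then
verified against the pair.  A sketch condensed from recompose_code() in
the diff below, using the real pair U+0041 + U+0300 -> U+00C0 as an
example:

    /* start = 0x0041 ('A'), code = 0x0300 (combining grave accent) */
    uint64      hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
    int         h = recompinfo.hash(&hashkey); /* recompinfo = UnicodeRecompInfo */

    if (h >= 0 && h < recompinfo.num_recomps)
    {
        const pg_unicode_decomposition *entry =
            &UnicodeDecompMain[recompinfo.inverse_lookup[h]];

        /* Verify that this entry decomposes exactly to (start, code). */
        if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
            code == UnicodeDecomp_codepoints[entry->dec_index + 1])
            *result = entry->codepoint;     /* here: 0x00C0 */
    }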

The size of all frontend deliverables, including libpq, remains
unchanged even with --enable-debug.

Author: John Naylor
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/CAFBsxsHUuMFCt6-pU+oG-F1==CmEp8wR+O+bRouXWu6i8kXuqA@mail.gmail.com
Michael Paquier 2020-10-23 11:05:46 +09:00
parent 7d6d6bce43
commit 783f0cc64d
5 changed files with 3227 additions and 44 deletions


@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ ../../../src/include/common/
$(MAKE) normalization-check
@@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_hashfunc.h: unicode_norm_table.h
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
$(PERL) generate-unicode_norm_table.pl


@@ -1,16 +1,22 @@
#!/usr/bin/perl
#
# Generate a composition table, using Unicode data files as input
# Generate a composition table and its lookup utilities, using Unicode data
# files as input.
#
# Input: UnicodeData.txt and CompositionExclusions.txt
# Output: unicode_norm_table.h
# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
#
# Copyright (c) 2000-2020, PostgreSQL Global Development Group
use strict;
use warnings;
my $output_file = "unicode_norm_table.h";
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;
my $output_table_file = "unicode_norm_table.h";
my $output_func_file = "unicode_norm_hashfunc.h";
my $FH;
@@ -64,11 +70,13 @@ close $FH;
my $num_characters = scalar @characters;
# Start writing out the output file
open my $OUTPUT, '>', $output_file
or die "Could not open output file $output_file: $!\n";
# Start writing out the output files
open my $OT, '>', $output_table_file
or die "Could not open output file $output_table_file: $!\n";
open my $OF, '>', $output_func_file
or die "Could not open output file $output_func_file: $!\n";
print $OUTPUT <<HEADER;
print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_norm_table.h
@@ -111,8 +119,53 @@ static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
HEADER
print $OF <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_norm_hashfunc.h
* Perfect hash functions used for Unicode normalization
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_norm_hashfunc.h
*
*-------------------------------------------------------------------------
*/
/*
* File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
* do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
* here.
*/
#include "common/unicode_norm_table.h"
/* Typedef for perfect hash functions */
typedef int (*cp_hash_func) (const void *key);
/* Information for lookups with perfect hash functions */
typedef struct
{
const pg_unicode_decomposition *decomps;
cp_hash_func hash;
int num_decomps;
} pg_unicode_decompinfo;
typedef struct
{
const uint16 *inverse_lookup;
cp_hash_func hash;
int num_recomps;
} pg_unicode_recompinfo;
HEADER
my $decomp_index = 0;
my $decomp_string = "";
my @dec_cp_packed;
my $main_index = 0;
my @rec_info;
my $last_code = $characters[-1]->{code};
foreach my $char (@characters)
@@ -121,6 +174,9 @@ foreach my $char (@characters)
my $class = $char->{class};
my $decomp = $char->{decomp};
# Save the code point bytes as a string in network order.
push @dec_cp_packed, pack('N', hex($char->{code}));
# The character decomposition mapping field in UnicodeData.txt is a list
# of unicode codepoints, separated by space. But it can be prefixed with
# so-called compatibility formatting tag, like "<compat>", or "<font>".
@@ -163,7 +219,7 @@ foreach my $char (@characters)
{
foreach my $lcode (@composition_exclusion_codes)
{
if ($lcode eq $char->{code})
if ($lcode eq $code)
{
$flags .= " | DECOMP_NO_COMPOSE";
$comment = "in exclusion list";
@@ -171,11 +227,26 @@ foreach my $char (@characters)
}
}
}
# Save info for recomposeable codepoints.
# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
# above! See also the inverse lookup in recompose_code() found in
# src/common/unicode_norm.c.
if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
{
push @rec_info,
{
code => $code,
main_index => $main_index,
first => $first_decomp,
second => $decomp_elts[0]
};
}
}
if ($decomp_size == 0)
{
print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
print $OT "\t{0x$code, $class, 0$flags, 0}";
}
elsif ($decomp_size == 1 && length($first_decomp) <= 4)
{
@@ -183,12 +254,11 @@ foreach my $char (@characters)
# The decomposition consists of a single codepoint, and it fits
# in a uint16, so we can store it "inline" in the main table.
$flags .= " | DECOMP_INLINE";
print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
}
else
{
print $OUTPUT
"\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
# Now save the decompositions into a dedicated area that will
# be written afterwards. First build the entry dedicated to
@@ -205,25 +275,17 @@ foreach my $char (@characters)
}
# Print a comma after all items except the last one.
print $OUTPUT "," unless ($code eq $last_code);
if ($comment ne "")
{
print $OT "," unless ($code eq $last_code);
# If the line is wide already, indent the comment with one tab,
# otherwise with two. This is to make the output match the way
# pgindent would mangle it. (This is quite hacky. To do this
# properly, we should actually track how long the line is so far,
# but this works for now.)
print $OUTPUT "\t" if ($decomp_index < 10);
print $OT "\t/* $comment */" if ($comment ne "");
print $OT "\n";
print $OUTPUT "\t/* $comment */" if ($comment ne "");
}
print $OUTPUT "\n";
$main_index++;
}
print $OUTPUT "\n};\n\n";
print $OT "\n};\n\n";
# Print the array of decomposed codes.
print $OUTPUT <<HEADER;
print $OT <<HEADER;
/* codepoints array */
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
{
@@ -231,4 +293,114 @@ $decomp_string
};
HEADER
close $OUTPUT;
# Emit the definition of the decomp hash function.
my $dec_funcname = 'Decomp_hash_func';
my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
$dec_funcname, fixed_key_length => 4);
print $OF "/* Perfect hash function for decomposition */\n";
print $OF "static $dec_func\n";
# Emit the structure that wraps the hash lookup information into
# one variable.
print $OF <<HEADER;
/* Hash lookup information for decomposition */
static const pg_unicode_decompinfo UnicodeDecompInfo =
{
UnicodeDecompMain,
$dec_funcname,
$num_characters
};
HEADER
# Find the lowest codepoint that decomposes to each recomposeable
# code pair and create a mapping to it.
my $recomp_string = "";
my @rec_cp_packed;
my %seenit;
my $firstentry = 1;
foreach my $rec (sort recomp_sort @rec_info)
{
# The hash key is formed by concatenating the bytes of the two
# codepoints. See also recompose_code() in common/unicode_norm.c.
my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
# We are only interested in the lowest code point that decomposes
# to the given code pair.
next if $seenit{$hashkey};
# Save the hash key bytes in network order
push @rec_cp_packed, pack('Q>', $hashkey);
# Append inverse lookup element
$recomp_string .= ",\n" if !$firstentry;
$recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
$rec->{first},
$rec->{second},
$rec->{code},
$rec->{main_index};
$seenit{$hashkey} = 1;
$firstentry = 0;
}
# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
my $num_recomps = scalar @rec_cp_packed;
print $OF <<HEADER;
/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
static const uint16 RecompInverseLookup[$num_recomps] =
{
$recomp_string
};
HEADER
# Emit the definition of the recomposition hash function.
my $rec_funcname = 'Recomp_hash_func';
my $rec_func =
PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
fixed_key_length => 8);
print $OF "/* Perfect hash function for recomposition */\n";
print $OF "static $rec_func\n";
# Emit the structure that wraps the hash lookup information into
# one variable.
print $OF <<HEADER;
/* Hash lookup information for recomposition */
static const pg_unicode_recompinfo UnicodeRecompInfo =
{
RecompInverseLookup,
$rec_funcname,
$num_recomps
};
HEADER
close $OT;
close $OF;
sub recomp_sort
{
my $a1 = hex($a->{first});
my $b1 = hex($b->{first});
my $a2 = hex($a->{second});
my $b2 = hex($b->{second});
# First sort by the first code point
return -1 if $a1 < $b1;
return 1 if $a1 > $b1;
# Then sort by the second code point
return -1 if $a2 < $b2;
return 1 if $a2 > $b2;
# Finally sort by the code point that decomposes into first and
# second ones.
my $acode = hex($a->{code});
my $bcode = hex($b->{code});
return -1 if $acode < $bcode;
return 1 if $acode > $bcode;
die "found duplicate entries of recomposeable code pairs";
}


@@ -19,9 +19,11 @@
#endif
#include "common/unicode_norm.h"
#include "common/unicode_norm_table.h"
#ifndef FRONTEND
#include "common/unicode_norm_hashfunc.h"
#include "common/unicode_normprops_table.h"
#else
#include "common/unicode_norm_table.h"
#endif
#include "port/pg_bswap.h"
@@ -44,6 +46,46 @@
#define NCOUNT VCOUNT * TCOUNT
#define SCOUNT LCOUNT * NCOUNT
/*
* get_code_entry
*
* Get the entry corresponding to code in the decomposition lookup table.
* The backend version of this code uses a perfect hash function for the
* lookup, while the frontend version uses a binary search.
*/
#ifndef FRONTEND
static const pg_unicode_decomposition *
get_code_entry(pg_wchar code)
{
int h;
uint32 hashkey;
pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
/*
* Compute the hash function. The hash key is the codepoint with the bytes
* in network order.
*/
hashkey = pg_hton32(code);
h = decompinfo.hash(&hashkey);
/* An out-of-range result implies no match */
if (h < 0 || h >= decompinfo.num_decomps)
return NULL;
/*
* Since it's a perfect hash, we need only match to the specific codepoint
* it identifies.
*/
if (code != decompinfo.decomps[h].codepoint)
return NULL;
/* Success! */
return &decompinfo.decomps[h];
}
#else
/* comparison routine for bsearch() of decomposition lookup table. */
static int
conv_compare(const void *p1, const void *p2)
@@ -56,10 +98,7 @@ conv_compare(const void *p1, const void *p2)
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
/*
* Get the entry corresponding to code in the decomposition lookup table.
*/
static pg_unicode_decomposition *
static const pg_unicode_decomposition *
get_code_entry(pg_wchar code)
{
return bsearch(&(code),
@@ -69,6 +108,8 @@ get_code_entry(pg_wchar code)
conv_compare);
}
#endif /* !FRONTEND */
/*
* Given a decomposition entry looked up earlier, get the decomposed
* characters.
@@ -77,7 +118,7 @@ get_code_entry(pg_wchar code)
* is only valid until next call to this function!
*/
static const pg_wchar *
get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
static pg_wchar x;
@@ -104,7 +145,7 @@ get_decomposed_size(pg_unicode_decomposition *entry, int *dec_size)
static int
get_decomposed_size(pg_wchar code, bool compat)
{
pg_unicode_decomposition *entry;
const pg_unicode_decomposition *entry;
int size = 0;
int i;
const uint32 *decomp;
@@ -191,17 +232,51 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
}
else
{
int i;
const pg_unicode_decomposition *entry;
/*
* Do an inverse lookup of the decomposition tables to see if anything
* matches. The comparison just needs to be a perfect match on the
* sub-table of size two, because the start character has already been
* recomposed partially.
* recomposed partially. This lookup uses a perfect hash function for
* the backend code.
*/
#ifndef FRONTEND
int h,
inv_lookup_index;
uint64 hashkey;
pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
/*
* Compute the hash function. The hash key is formed by concatenating
* bytes of the two codepoints in network order. See also
* src/common/unicode/generate-unicode_norm_table.pl.
*/
hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
h = recompinfo.hash(&hashkey);
/* An out-of-range result implies no match */
if (h < 0 || h >= recompinfo.num_recomps)
return false;
inv_lookup_index = recompinfo.inverse_lookup[h];
entry = &UnicodeDecompMain[inv_lookup_index];
if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
code == UnicodeDecomp_codepoints[entry->dec_index + 1])
{
*result = entry->codepoint;
return true;
}
#else
int i;
for (i = 0; i < lengthof(UnicodeDecompMain); i++)
{
const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
entry = &UnicodeDecompMain[i];
if (DECOMPOSITION_SIZE(entry) != 2)
continue;
@@ -216,6 +291,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
return true;
}
}
#endif /* !FRONTEND */
}
return false;
@@ -231,7 +307,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
static void
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
pg_unicode_decomposition *entry;
const pg_unicode_decomposition *entry;
int i;
const uint32 *decomp;
int dec_size;
@@ -358,8 +434,8 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
pg_wchar prev = decomp_chars[count - 1];
pg_wchar next = decomp_chars[count];
pg_wchar tmp;
pg_unicode_decomposition *prevEntry = get_code_entry(prev);
pg_unicode_decomposition *nextEntry = get_code_entry(next);
const pg_unicode_decomposition *prevEntry = get_code_entry(prev);
const pg_unicode_decomposition *nextEntry = get_code_entry(next);
/*
* If no entries are found, the character used is either a Hangul
@@ -417,7 +493,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
pg_wchar ch = decomp_chars[count];
pg_unicode_decomposition *ch_entry = get_code_entry(ch);
const pg_unicode_decomposition *ch_entry = get_code_entry(ch);
int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
pg_wchar composite;
@@ -458,7 +534,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
static uint8
get_canonical_class(pg_wchar ch)
{
pg_unicode_decomposition *entry = get_code_entry(ch);
const pg_unicode_decomposition *entry = get_code_entry(ch);
if (!entry)
return 0;

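For context, the function whose hot path this work speeds up is
unicode_normalize().  A minimal caller-side sketch, assuming the
UNICODE_NFC enumerator and the zero-terminated pg_wchar string
convention from common/unicode_norm.h:

    #include "common/unicode_norm.h"

    /* Decomposed form of U+00C0: U+0041 followed by U+0300. */
    pg_wchar    input[] = {0x0041, 0x0300, 0x0000};

    /*
     * With the perfect hash in place, each recomposition step of the NFC
     * conversion is a single hash probe instead of a linear scan over
     * UnicodeDecompMain[].  Expected result: {0x00C0, 0x0000}.
     */
    pg_wchar   *output = unicode_normalize(UNICODE_NFC, input);
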
File diff suppressed because it is too large


@@ -18,9 +18,10 @@ src/backend/utils/fmgrprotos\.h$
# they match pgindent style, they'd look worse not better, so exclude them.
kwlist_d\.h$
#
# This is generated by the scripts from src/common/unicode/. It uses
# These are generated by the scripts from src/common/unicode/. They use
# hash functions generated by PerfectHash.pm whose format looks worse with
# pgindent.
src/include/common/unicode_norm_hashfunc\.h$
src/include/common/unicode_normprops_table\.h$
#
# Exclude ecpg test files to avoid breaking the ecpg regression tests