Add support for automatically updating Unicode derived files

We currently have several sets of files generated from data provided
by Unicode.  These all have ad hoc rules and instructions for updating
when new Unicode versions appear, and it's not done consistently.

This patch centralizes and automates the process and makes it part of
the release checklist.  The Unicode and CLDR versions are specified in
Makefile.global.in.  There is a new make target "update-unicode" that
downloads all the relevant files and runs the generation script.

There is also a new script for generating the table of combining
characters for ucs_wcwidth().  That table is now in a separate include
file rather than hardcoded into the middle of other code.  This is
based on the script that was used for generating
d8594d123c, but the script itself wasn't
committed at that time.

Reviewed-by: John Naylor <john.naylor@2ndquadrant.com>
Discussion: https://www.postgresql.org/message-id/flat/c8d05f42-443e-6c23-819b-05b31759a37c@2ndquadrant.com
This commit is contained in:
Peter Eisentraut 2020-01-09 09:54:47 +01:00
parent f5fd995a1a
commit f85a485f89
13 changed files with 313 additions and 94 deletions

View File

@ -75,6 +75,10 @@ $(call recurse,installcheck-world,src/test src/pl src/interfaces/ecpg contrib sr
GNUmakefile: GNUmakefile.in $(top_builddir)/config.status
./config.status $@
# Refresh the files derived from Unicode data by invoking the target of
# the same name ($@) in each subdirectory that provides one.  The
# prerequisites are listed after "|", i.e. they are order-only: they are
# brought up to date first, but their timestamps do not affect this target.
update-unicode: | submake-generated-headers submake-libpgport
$(MAKE) -C src/common/unicode $@
$(MAKE) -C contrib/unaccent $@
##########################################################################

View File

@ -2,3 +2,6 @@
/log/
/results/
/tmp_check/
# Downloaded files
/Latin-ASCII.xml

View File

@ -26,3 +26,22 @@ top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
# Regenerate unaccent.rules from the Unicode and CLDR data files.
update-unicode: unaccent.rules

# Allow running this even without --with-python.  A plain
# "PYTHON ?= python" is not sufficient for that: ?= leaves a variable
# alone as soon as it is defined, even when it is defined as empty, and
# the recipe below would then try to execute the .py file directly.
# NOTE(review): presumably Makefile.global defines PYTHON as empty when
# configured without Python -- confirm.
ifeq ($(strip $(PYTHON)),)
PYTHON = python
endif

# $< is the generator script; the second and third prerequisites are the
# two input data files, so their order in the prerequisite list matters.
unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml
	$(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@

# Only download it once; dependencies must match src/common/unicode/
../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global
	$(MAKE) -C $(@D) $(@F)

# Dependency on Makefile.global is for CLDR_VERSION.  The CLDR release
# tag uses dashes where the version number uses dots.
Latin-ASCII.xml: $(top_builddir)/src/Makefile.global
	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml

# Remove the downloaded file
distclean:
	rm -f Latin-ASCII.xml

View File

@ -20,13 +20,11 @@
# option is enabled, the XML file of this transliterator [2] -- given as a
# command line argument -- will be parsed and used.
#
# Ideally you should use the latest release for each data set. For
# Latin-ASCII.xml, the latest data sets released can be browsed directly
# via [3]. Note that this script is compatible with at least release 29.
# Ideally you should use the latest release for each data set. This
# script is compatible with at least CLDR release 29.
#
# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
# [3] https://github.com/unicode-org/cldr/tags
# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt
# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
# The approach is to be Python3 compatible with Python2 "backports".

View File

@ -23,7 +23,7 @@ standard_targets = all install installdirs uninstall distprep clean distclean ma
# these targets should recurse even into subdirectories not being built:
standard_always_targets = distprep clean distclean maintainer-clean
.PHONY: $(standard_targets) install-strip html man installcheck-parallel
.PHONY: $(standard_targets) install-strip html man installcheck-parallel update-unicode
# make `all' the default target
all:
@ -352,6 +352,22 @@ XGETTEXT = @XGETTEXT@
GZIP = gzip
BZIP2 = bzip2
# Command prefix for fetching a URL into the current target; the URL is
# appended by the rule that uses it.  This has to remain a recursively
# expanded variable (=) so that $@ is resolved inside the recipe rather
# than here.  --no-use-server-timestamps keeps wget from stamping the
# downloaded file with the server's modification time.
DOWNLOAD = wget -O $@ --no-use-server-timestamps
# Alternative using curl:
#DOWNLOAD = curl -o $@
# Unicode data information
# Before each major release, update these and run make update-unicode.
# Pick a release from here: <https://www.unicode.org/Public/>. Note
# that the most recent release listed there is often a pre-release;
# don't pick that one, except for testing.
UNICODE_VERSION = 12.1.0
# Pick a release from here: <http://cldr.unicode.org/index/downloads>
CLDR_VERSION = 34
# Tree-wide build support

View File

@ -115,9 +115,6 @@ maintainer-clean: distclean
rm -f $(MAPS)
DOWNLOAD = wget -O $@ --no-use-server-timestamps
#DOWNLOAD = curl -o $@
BIG5.TXT CNS11643.TXT:
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)

View File

@ -643,73 +643,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
static int
ucs_wcwidth(pg_wchar ucs)
{
/* sorted list of non-overlapping intervals of non-spacing characters */
static const struct mbinterval combining[] = {
{0x0300, 0x036F}, {0x0483, 0x0489}, {0x0591, 0x05BD},
{0x05BF, 0x05BF}, {0x05C1, 0x05C2}, {0x05C4, 0x05C5},
{0x05C7, 0x05C7}, {0x0610, 0x061A}, {0x064B, 0x065F},
{0x0670, 0x0670}, {0x06D6, 0x06DC}, {0x06DF, 0x06E4},
{0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x0711, 0x0711},
{0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x07EB, 0x07F3},
{0x07FD, 0x07FD}, {0x0816, 0x0819}, {0x081B, 0x0823},
{0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B},
{0x08D3, 0x08E1}, {0x08E3, 0x0902}, {0x093A, 0x093A},
{0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
{0x0951, 0x0957}, {0x0962, 0x0963}, {0x0981, 0x0981},
{0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
{0x09E2, 0x09E3}, {0x09FE, 0x0A02}, {0x0A3C, 0x0A3C},
{0x0A41, 0x0A51}, {0x0A70, 0x0A71}, {0x0A75, 0x0A75},
{0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC8},
{0x0ACD, 0x0ACD}, {0x0AE2, 0x0AE3}, {0x0AFA, 0x0B01},
{0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B44},
{0x0B4D, 0x0B56}, {0x0B62, 0x0B63}, {0x0B82, 0x0B82},
{0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C00, 0x0C00},
{0x0C04, 0x0C04}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C56},
{0x0C62, 0x0C63}, {0x0C81, 0x0C81}, {0x0CBC, 0x0CBC},
{0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
{0x0CE2, 0x0CE3}, {0x0D00, 0x0D01}, {0x0D3B, 0x0D3C},
{0x0D41, 0x0D44}, {0x0D4D, 0x0D4D}, {0x0D62, 0x0D63},
{0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD6}, {0x0E31, 0x0E31},
{0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
{0x0EB4, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19},
{0x0F35, 0x0F35}, {0x0F37, 0x0F37}, {0x0F39, 0x0F39},
{0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, {0x0F86, 0x0F87},
{0x0F8D, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
{0x1032, 0x1037}, {0x1039, 0x103A}, {0x103D, 0x103E},
{0x1058, 0x1059}, {0x105E, 0x1060}, {0x1071, 0x1074},
{0x1082, 0x1082}, {0x1085, 0x1086}, {0x108D, 0x108D},
{0x109D, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714},
{0x1732, 0x1734}, {0x1752, 0x1753}, {0x1772, 0x1773},
{0x17B4, 0x17B5}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
{0x17C9, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D},
{0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
{0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
{0x1A17, 0x1A18}, {0x1A1B, 0x1A1B}, {0x1A56, 0x1A56},
{0x1A58, 0x1A60}, {0x1A62, 0x1A62}, {0x1A65, 0x1A6C},
{0x1A73, 0x1A7F}, {0x1AB0, 0x1B03}, {0x1B34, 0x1B34},
{0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
{0x1B6B, 0x1B73}, {0x1B80, 0x1B81}, {0x1BA2, 0x1BA5},
{0x1BA8, 0x1BA9}, {0x1BAB, 0x1BAD}, {0x1BE6, 0x1BE6},
{0x1BE8, 0x1BE9}, {0x1BED, 0x1BED}, {0x1BEF, 0x1BF1},
{0x1C2C, 0x1C33}, {0x1C36, 0x1C37}, {0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE0}, {0x1CE2, 0x1CE8}, {0x1CED, 0x1CED},
{0x1CF4, 0x1CF4}, {0x1CF8, 0x1CF9}, {0x1DC0, 0x1DFF},
{0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F},
{0x2DE0, 0x2DFF}, {0x302A, 0x302D}, {0x3099, 0x309A},
{0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F},
{0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806},
{0xA80B, 0xA80B}, {0xA825, 0xA826}, {0xA8C4, 0xA8C5},
{0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D},
{0xA947, 0xA951}, {0xA980, 0xA982}, {0xA9B3, 0xA9B3},
{0xA9B6, 0xA9B9}, {0xA9BC, 0xA9BD}, {0xA9E5, 0xA9E5},
{0xAA29, 0xAA2E}, {0xAA31, 0xAA32}, {0xAA35, 0xAA36},
{0xAA43, 0xAA43}, {0xAA4C, 0xAA4C}, {0xAA7C, 0xAA7C},
{0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8},
{0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEC, 0xAAED},
{0xAAF6, 0xAAF6}, {0xABE5, 0xABE5}, {0xABE8, 0xABE8},
{0xABED, 0xABED}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
{0xFE20, 0xFE2F},
};
#include "common/unicode_combining_table.h"
/* test for 8-bit control characters */
if (ucs == 0)

View File

@ -1,7 +1,7 @@
/norm_test
/norm_test_table.h
# Files downloaded from the Unicode Character Database
# Downloaded files
/CompositionExclusions.txt
/NormalizationTest.txt
/UnicodeData.txt

View File

@ -18,18 +18,24 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
DOWNLOAD = wget -O $@ --no-use-server-timestamps
# Regenerate the Unicode-derived headers, install them into the source
# tree, and then run the normalization test suite.  The headers are moved
# into place before the check so that the check covers the new tables; a
# failing check leaves the new headers in the tree for inspection.
# NOTE(review): presumably norm_test is compiled against the headers in
# src/include/common/, so checking before the move would exercise the
# previous tables -- confirm.
update-unicode: unicode_norm_table.h unicode_combining_table.h
	mv $^ ../../../src/include/common/
	$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
# them on demand.
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
$(PERL) generate-unicode_norm_table.pl
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
# Test suite
normalization-check: norm_test
./norm_test

View File

@ -8,20 +8,11 @@ of Unicode.
Generating unicode_norm_table.h
-------------------------------
1. Download the Unicode data file, UnicodeData.txt, from the Unicode
consortium and place it to the current directory. Run the perl script
"generate-unicode_norm_table.pl", to process it, and to generate the
"unicode_norm_table.h" file. The Makefile contains a rule to download the
data files if they don't exist.
make unicode_norm_table.h
2. Inspect the resulting header file. Once you're happy with it, copy it to
the right location.
cp unicode_norm_table.h ../../../src/include/common/
Run
make update-unicode
from the top level of the source tree and commit the result.
Tests
-----
@ -33,3 +24,5 @@ normalization code with all the test strings in NormalizationTest.txt.
To download NormalizationTest.txt and run the tests:
make normalization-check
This is also run as part of the update-unicode target.

View File

@ -0,0 +1,52 @@
#!/usr/bin/perl
#
# Generate sorted list of non-overlapping intervals of non-spacing
# characters, using Unicode data files as input.  Pass UnicodeData.txt
# as argument.  The output is on stdout.
#
# Only code points up to U+FFFF are considered.  A code point counts as
# combining when its general category (the third ';'-separated field) is
# Me or Mn.  Consecutive combining entries in the input are merged into
# one interval, even if the input skips code points between them; an
# interval is closed by the next entry that is not combining.
#
# Copyright (c) 2019, PostgreSQL Global Development Group

use strict;
use warnings;

# First and last code point of the interval currently being collected;
# both are undef while no interval is open.
my $range_start = undef;
my $range_end   = undef;

# Print the interval collected so far, if any, and reset the state.
sub flush_range
{
	return unless defined($range_start);

	printf "\t{0x%04X, 0x%04X},\n", $range_start, $range_end;
	$range_start = undef;
	$range_end   = undef;
	return;
}

print "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";

print "static const struct mbinterval combining[] = {\n";

while (my $line = <ARGV>)
{
	chomp $line;
	my @fields = split ';', $line;
	my $codepoint = hex $fields[0];

	# Entries above U+FFFF are ignored entirely: they neither extend nor
	# close an open interval.
	next if $codepoint > 0xFFFF;

	if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
	{
		# combining character: open a new interval if needed and extend it
		$range_start = $codepoint if !defined($range_start);
		$range_end = $codepoint;
	}
	else
	{
		# not a combining character, print out previous range if any
		flush_range();
	}
}

# The input may end, or move past U+FFFF, while an interval is still
# open; make sure it is not silently dropped.
flush_range();

print "};\n";

View File

@ -0,0 +1,194 @@
/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */
static const struct mbinterval combining[] = {
{0x0300, 0x036F},
{0x0483, 0x0489},
{0x0591, 0x05BD},
{0x05BF, 0x05BF},
{0x05C1, 0x05C2},
{0x05C4, 0x05C5},
{0x05C7, 0x05C7},
{0x0610, 0x061A},
{0x064B, 0x065F},
{0x0670, 0x0670},
{0x06D6, 0x06DC},
{0x06DF, 0x06E4},
{0x06E7, 0x06E8},
{0x06EA, 0x06ED},
{0x0711, 0x0711},
{0x0730, 0x074A},
{0x07A6, 0x07B0},
{0x07EB, 0x07F3},
{0x07FD, 0x07FD},
{0x0816, 0x0819},
{0x081B, 0x0823},
{0x0825, 0x0827},
{0x0829, 0x082D},
{0x0859, 0x085B},
{0x08D3, 0x08E1},
{0x08E3, 0x0902},
{0x093A, 0x093A},
{0x093C, 0x093C},
{0x0941, 0x0948},
{0x094D, 0x094D},
{0x0951, 0x0957},
{0x0962, 0x0963},
{0x0981, 0x0981},
{0x09BC, 0x09BC},
{0x09C1, 0x09C4},
{0x09CD, 0x09CD},
{0x09E2, 0x09E3},
{0x09FE, 0x0A02},
{0x0A3C, 0x0A3C},
{0x0A41, 0x0A51},
{0x0A70, 0x0A71},
{0x0A75, 0x0A75},
{0x0A81, 0x0A82},
{0x0ABC, 0x0ABC},
{0x0AC1, 0x0AC8},
{0x0ACD, 0x0ACD},
{0x0AE2, 0x0AE3},
{0x0AFA, 0x0B01},
{0x0B3C, 0x0B3C},
{0x0B3F, 0x0B3F},
{0x0B41, 0x0B44},
{0x0B4D, 0x0B56},
{0x0B62, 0x0B63},
{0x0B82, 0x0B82},
{0x0BC0, 0x0BC0},
{0x0BCD, 0x0BCD},
{0x0C00, 0x0C00},
{0x0C04, 0x0C04},
{0x0C3E, 0x0C40},
{0x0C46, 0x0C56},
{0x0C62, 0x0C63},
{0x0C81, 0x0C81},
{0x0CBC, 0x0CBC},
{0x0CBF, 0x0CBF},
{0x0CC6, 0x0CC6},
{0x0CCC, 0x0CCD},
{0x0CE2, 0x0CE3},
{0x0D00, 0x0D01},
{0x0D3B, 0x0D3C},
{0x0D41, 0x0D44},
{0x0D4D, 0x0D4D},
{0x0D62, 0x0D63},
{0x0DCA, 0x0DCA},
{0x0DD2, 0x0DD6},
{0x0E31, 0x0E31},
{0x0E34, 0x0E3A},
{0x0E47, 0x0E4E},
{0x0EB1, 0x0EB1},
{0x0EB4, 0x0EBC},
{0x0EC8, 0x0ECD},
{0x0F18, 0x0F19},
{0x0F35, 0x0F35},
{0x0F37, 0x0F37},
{0x0F39, 0x0F39},
{0x0F71, 0x0F7E},
{0x0F80, 0x0F84},
{0x0F86, 0x0F87},
{0x0F8D, 0x0FBC},
{0x0FC6, 0x0FC6},
{0x102D, 0x1030},
{0x1032, 0x1037},
{0x1039, 0x103A},
{0x103D, 0x103E},
{0x1058, 0x1059},
{0x105E, 0x1060},
{0x1071, 0x1074},
{0x1082, 0x1082},
{0x1085, 0x1086},
{0x108D, 0x108D},
{0x109D, 0x109D},
{0x135D, 0x135F},
{0x1712, 0x1714},
{0x1732, 0x1734},
{0x1752, 0x1753},
{0x1772, 0x1773},
{0x17B4, 0x17B5},
{0x17B7, 0x17BD},
{0x17C6, 0x17C6},
{0x17C9, 0x17D3},
{0x17DD, 0x17DD},
{0x180B, 0x180D},
{0x1885, 0x1886},
{0x18A9, 0x18A9},
{0x1920, 0x1922},
{0x1927, 0x1928},
{0x1932, 0x1932},
{0x1939, 0x193B},
{0x1A17, 0x1A18},
{0x1A1B, 0x1A1B},
{0x1A56, 0x1A56},
{0x1A58, 0x1A60},
{0x1A62, 0x1A62},
{0x1A65, 0x1A6C},
{0x1A73, 0x1A7F},
{0x1AB0, 0x1B03},
{0x1B34, 0x1B34},
{0x1B36, 0x1B3A},
{0x1B3C, 0x1B3C},
{0x1B42, 0x1B42},
{0x1B6B, 0x1B73},
{0x1B80, 0x1B81},
{0x1BA2, 0x1BA5},
{0x1BA8, 0x1BA9},
{0x1BAB, 0x1BAD},
{0x1BE6, 0x1BE6},
{0x1BE8, 0x1BE9},
{0x1BED, 0x1BED},
{0x1BEF, 0x1BF1},
{0x1C2C, 0x1C33},
{0x1C36, 0x1C37},
{0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE0},
{0x1CE2, 0x1CE8},
{0x1CED, 0x1CED},
{0x1CF4, 0x1CF4},
{0x1CF8, 0x1CF9},
{0x1DC0, 0x1DFF},
{0x20D0, 0x20F0},
{0x2CEF, 0x2CF1},
{0x2D7F, 0x2D7F},
{0x2DE0, 0x2DFF},
{0x302A, 0x302D},
{0x3099, 0x309A},
{0xA66F, 0xA672},
{0xA674, 0xA67D},
{0xA69E, 0xA69F},
{0xA6F0, 0xA6F1},
{0xA802, 0xA802},
{0xA806, 0xA806},
{0xA80B, 0xA80B},
{0xA825, 0xA826},
{0xA8C4, 0xA8C5},
{0xA8E0, 0xA8F1},
{0xA8FF, 0xA8FF},
{0xA926, 0xA92D},
{0xA947, 0xA951},
{0xA980, 0xA982},
{0xA9B3, 0xA9B3},
{0xA9B6, 0xA9B9},
{0xA9BC, 0xA9BD},
{0xA9E5, 0xA9E5},
{0xAA29, 0xAA2E},
{0xAA31, 0xAA32},
{0xAA35, 0xAA36},
{0xAA43, 0xAA43},
{0xAA4C, 0xAA4C},
{0xAA7C, 0xAA7C},
{0xAAB0, 0xAAB0},
{0xAAB2, 0xAAB4},
{0xAAB7, 0xAAB8},
{0xAABE, 0xAABF},
{0xAAC1, 0xAAC1},
{0xAAEC, 0xAAED},
{0xAAF6, 0xAAF6},
{0xABE5, 0xABE5},
{0xABE8, 0xABE8},
{0xABED, 0xABED},
{0xFB1E, 0xFB1E},
{0xFE00, 0xFE0F},
{0xFE20, 0xFE2F},
};

View File

@ -77,6 +77,9 @@ but there may be reasons to do them at other times as well.
* Update inet/cidr data types with newest Bind patches
* Update Unicode data: Edit UNICODE_VERSION and CLDR_VERSION in
src/Makefile.global.in, run make update-unicode, and commit.
Starting a New Development Cycle
================================