diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 7a0a96e04f..acfb4f0b68 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -24,9 +24,9 @@
 # Latin-ASCII.xml, the latest data sets released can be browsed directly
 # via [3]. Note that this script is compatible with at least release 29.
 #
-# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
-# [3] https://unicode.org/cldr/trac/browser/tags
+# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://github.com/unicode-org/cldr/tags

 # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 # The approach is to be Python3 compatible with Python2 "backports".
@@ -113,7 +113,7 @@ def is_mark(codepoint):

 def is_letter_with_marks(codepoint, table):
     """Returns true for letters combined with one or more marks."""
-    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

     # Letter may have no combining characters, in which case it has
     # no marks.
@@ -226,7 +226,7 @@ def special_cases():
     return charactersSet

 def main(args):
-    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
     decomposition_type_pattern = re.compile(" *<[^>]*> *")

     table = {}
@@ -243,7 +243,7 @@ def main(args):
     for line in unicodeDataFile:
         fields = line.split(";")
         if len(fields) > 5:
-            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
             general_category = fields[2]
             decomposition = fields[5]
             decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
@@ -281,8 +281,8 @@ def main(args):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
-    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See .", type=str, required=True, dest='unicodeDataFilePath')
-    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See .", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
     parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')

     args = parser.parse_args()
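For context on the generate_unaccent_rules.py hunks: the TR44 links describe the UnicodeData.txt record layout the script consumes, with the general category in field 2 and the decomposition mapping (optionally prefixed by a <type> tag) in field 5. A minimal standalone sketch of that parsing step, using a real UnicodeData.txt record for U+00C5 but illustrative variable names, not the script's own code:

    import re

    # One semicolon-separated UnicodeData.txt record (U+00C5, LATIN CAPITAL
    # LETTER A WITH RING ABOVE); field 2 is the general category, field 5
    # the canonical decomposition "0041 030A" (A + COMBINING RING ABOVE).
    line = "00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;LATIN CAPITAL LETTER A RING;;;00E5;"
    fields = line.split(";")
    general_category = fields[2]
    decomposition = fields[5]
    # Strip a decomposition-type tag such as "<compat>", as the script's regex does.
    decomposition = re.sub(" *<[^>]*> *", " ", decomposition)
    codepoints = [int(cp, 16) for cp in decomposition.split()]
    print(general_category, [hex(cp) for cp in codepoints])   # Lu ['0x41', '0x30a']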
diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml
index 411e368a9c..f638665dc9 100644
--- a/doc/src/sgml/acronyms.sgml
+++ b/doc/src/sgml/acronyms.sgml
@@ -728,7 +728,7 @@
    UTF
-      Unicode Transformation
+      Unicode Transformation
       Format
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index b672da47d0..45290bd27b 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -832,12 +832,12 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
-    See Unicode
+    See Unicode
     Technical Standard #35
     and BCP 47 for details.
     The list of possible collation types (co subtag) can be found in
-    the CLDR
+    the CLDR
     repository.
     The ICU Locale Explorer can be used to check the details of a particular locale
@@ -900,7 +900,7 @@ CREATE COLLATION french FROM "fr-x-icu";
     different Unicode normal forms.  It is up to the collation provider to
     actually implement such insensitive comparisons; the deterministic flag
     only determines whether ties are to be broken using bytewise comparison.
-    See also Unicode Technical
+    See also Unicode Technical
     Standard 10 for more information on the terminology.
@@ -1926,7 +1926,7 @@ RESET client_encoding;
-
+
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index a97e1c6cd7..63710f9ea7 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -119,7 +119,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 #DOWNLOAD = curl -o $@

 BIG5.TXT CNS11643.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)

 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
@@ -131,19 +131,19 @@ GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'

 JIS0212.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)

 JOHAB.TXT KSX1001.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)

 KOI8-R.TXT KOI8-U.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)

 $(ISO8859TEXTS):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/ISO8859/$(@F)

 $(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)

 $(filter CP8%,$(WINTEXTS)):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
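The second charset.sgml hunk sits in the paragraph on nondeterministic collations: the provider implements the insensitive comparison itself, and the deterministic flag only decides whether equal-keyed strings are tie-broken bytewise. A minimal pure-Python sketch of that distinction (casefold() stands in for a real collation key; this illustrates the concept, not PostgreSQL's code):

    def collate(a: str, b: str, deterministic: bool) -> int:
        ka, kb = a.casefold(), b.casefold()     # stand-in for a collation sort key
        if ka != kb:
            return -1 if ka < kb else 1
        if deterministic:                       # equal keys: break the tie bytewise
            ba, bb = a.encode("utf-8"), b.encode("utf-8")
            return 0 if ba == bb else (-1 if ba < bb else 1)
        return 0                                # nondeterministic: a genuine tie

    print(collate("Straße", "STRASSE", deterministic=False))  # 0: compares equal
    print(collate("Straße", "STRASSE", deterministic=True))   # 1: bytewise tie-break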
diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
index bcdd29b686..297f7b9893 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 #
 # Our "big5" comes from BIG5.TXT, with the addition of the characters
 # in the range 0xf9d6-0xf9dc from CP950.TXT.
diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
index 222093dff2..8645a7ea6e 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #	JOHAB code in hex
 #	UCS-2 code in hex
diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl
index 647417b4bf..2290feddf4 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_most.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #	source character set code in hex
 #	UCS-2 code in hex
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index e20ef778f3..334859c984 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -23,7 +23,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 # These files are part of the Unicode Character Database. Download
 # them on demand.
 UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
-	$(DOWNLOAD) http://unicode.org/Public/UNIDATA/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)

 # Generation of conversion tables used for string normalization with
 # UTF-8 strings.
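The unicode_norm.c hunks below cite tr15-18 annex 10, which is why the C code can special-case the Hangul block: decomposition there is pure arithmetic, so no table entries are needed. A short sketch of that arithmetic (the constant names mirror the SBASE/SCOUNT convention visible in the hunks; this is an illustration, not the C implementation):

    SBASE, LBASE, VBASE, TBASE = 0xAC00, 0x1100, 0x1161, 0x11A7
    LCOUNT, VCOUNT, TCOUNT = 19, 21, 28
    NCOUNT = VCOUNT * TCOUNT          # 588 syllables per leading consonant
    SCOUNT = LCOUNT * NCOUNT          # 11172 precomposed Hangul syllables

    def decompose_hangul(code):
        sindex = code - SBASE
        if not 0 <= sindex < SCOUNT:
            return [code]             # outside the precomposed Hangul range
        l = LBASE + sindex // NCOUNT              # leading consonant jamo
        v = VBASE + (sindex % NCOUNT) // TCOUNT   # vowel jamo
        t = TBASE + sindex % TCOUNT               # optional trailing consonant
        return [l, v] + ([t] if t != TBASE else [])

    print([hex(c) for c in decompose_hangul(0xD55C)])  # U+D55C: ['0x1112', '0x1161', '0x11ab']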
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 89c5533212..7509f81437 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -3,7 +3,7 @@
  * Normalize a Unicode string to NFKC form
  *
  * This implements Unicode normalization, per the documentation at
- * http://www.unicode.org/reports/tr15/.
+ * https://www.unicode.org/reports/tr15/.
  *
  * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
  *
@@ -109,7 +109,7 @@ get_decomposed_size(pg_wchar code)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
@@ -234,7 +234,7 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
@@ -362,7 +362,7 @@ unicode_normalize_kc(const pg_wchar *input)
 			continue;

 		/*
-		 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
+		 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
 		 * a sequence of two adjacent characters in a string is an
 		 * exchangeable pair if the combining class (from the Unicode
 		 * Character Database) for the first character is greater than the
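The final hunk's comment defines an exchangeable pair per tr15-18 annex 4: two adjacent characters where the first's canonical combining class is greater than the second's and the second's class is non-zero, in which case canonical ordering swaps them. A small sketch of that reordering using Python's unicodedata module (an illustration of the rule, not the unicode_norm.c loop itself):

    import unicodedata

    def canonical_order(s):
        chars = list(s)
        for i in range(1, len(chars)):
            j = i
            ccc = unicodedata.combining(chars[j])   # canonical combining class, 0 if none
            # Swap backwards while the preceding character forms an exchangeable pair.
            while j > 0 and ccc != 0 and unicodedata.combining(chars[j - 1]) > ccc:
                chars[j - 1], chars[j] = chars[j], chars[j - 1]
                j -= 1
        return "".join(chars)

    # U+0323 (dot below, ccc 220) must sort before U+0301 (acute, ccc 230):
    print([hex(ord(c)) for c in canonical_order("q\u0301\u0323")])
    # ['0x71', '0x323', '0x301']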