kopia lustrzana git://git.postgresql.org/git/postgresql.git
Update unicode.org URLs
Use https, consistent host name, remove references to ftp. Also update the URLs for CLDR, which has moved from Trac to GitHub.
This commit is contained in:
rodzic
9abb2bfc04
commit
bdb839cbde
|
@ -24,9 +24,9 @@
|
|||
# Latin-ASCII.xml, the latest data sets released can be browsed directly
|
||||
# via [3]. Note that this script is compatible with at least release 29.
|
||||
#
|
||||
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
||||
# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
|
||||
# [3] https://unicode.org/cldr/trac/browser/tags
|
||||
# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
||||
# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
|
||||
# [3] https://github.com/unicode-org/cldr/tags
|
||||
|
||||
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
||||
# The approach is to be Python3 compatible with Python2 "backports".
|
||||
|
@ -113,7 +113,7 @@ def is_mark(codepoint):
|
|||
|
||||
def is_letter_with_marks(codepoint, table):
|
||||
"""Returns true for letters combined with one or more marks."""
|
||||
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
|
||||
# See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
|
||||
|
||||
# Letter may have no combining characters, in which case it has
|
||||
# no marks.
|
||||
|
@ -226,7 +226,7 @@ def special_cases():
|
|||
return charactersSet
|
||||
|
||||
def main(args):
|
||||
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
|
||||
# https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
|
||||
decomposition_type_pattern = re.compile(" *<[^>]*> *")
|
||||
|
||||
table = {}
|
||||
|
@ -243,7 +243,7 @@ def main(args):
|
|||
for line in unicodeDataFile:
|
||||
fields = line.split(";")
|
||||
if len(fields) > 5:
|
||||
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
|
||||
# https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
|
||||
general_category = fields[2]
|
||||
decomposition = fields[5]
|
||||
decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
|
||||
|
@ -281,8 +281,8 @@ def main(args):
|
|||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
|
||||
parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
|
||||
parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
|
||||
parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
|
||||
parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
|
||||
parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
|
@ -728,7 +728,7 @@
|
|||
<term><acronym>UTF</acronym></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<ulink url="http://www.unicode.org/">Unicode Transformation
|
||||
<ulink url="https://www.unicode.org/">Unicode Transformation
|
||||
Format</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
|
|
|
@ -832,12 +832,12 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
|
|||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
|
||||
See <ulink url="https://www.unicode.org/reports/tr35/tr35-collation.html">Unicode
|
||||
Technical Standard #35</ulink>
|
||||
and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
|
||||
details. The list of possible collation types (<literal>co</literal>
|
||||
subtag) can be found in
|
||||
the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
|
||||
the <ulink url="https://github.com/unicode-org/cldr/blob/master/common/bcp47/collation.xml">CLDR
|
||||
repository</ulink>.
|
||||
The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
|
||||
Explorer</ulink> can be used to check the details of a particular locale
|
||||
|
@ -900,7 +900,7 @@ CREATE COLLATION french FROM "fr-x-icu";
|
|||
different Unicode normal forms. It is up to the collation provider to
|
||||
actually implement such insensitive comparisons; the deterministic flag
|
||||
only determines whether ties are to be broken using bytewise comparison.
|
||||
See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
|
||||
See also <ulink url="https://www.unicode.org/reports/tr10">Unicode Technical
|
||||
Standard 10</ulink> for more information on the terminology.
|
||||
</para>
|
||||
|
||||
|
@ -1926,7 +1926,7 @@ RESET client_encoding;
|
|||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><ulink url="http://www.unicode.org/"></ulink></term>
|
||||
<term><ulink url="https://www.unicode.org/"></ulink></term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
|
|
|
@ -119,7 +119,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
|
|||
#DOWNLOAD = curl -o $@
|
||||
|
||||
BIG5.TXT CNS11643.TXT:
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
|
||||
|
||||
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
|
||||
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
|
||||
|
@ -131,19 +131,19 @@ GB2312.TXT:
|
|||
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
||||
|
||||
JIS0212.TXT:
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
|
||||
|
||||
JOHAB.TXT KSX1001.TXT:
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
|
||||
|
||||
KOI8-R.TXT KOI8-U.TXT:
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
|
||||
|
||||
$(ISO8859TEXTS):
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
|
||||
|
||||
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
|
||||
|
||||
$(filter CP8%,$(WINTEXTS)):
|
||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
# map files provided by Unicode organization.
|
||||
# Unfortunately it is prohibited by the organization
|
||||
# to distribute the map files. So if you try to use this script,
|
||||
# you have to obtain the map files from the organization's ftp site.
|
||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
||||
# you have to obtain the map files from the organization's download site.
|
||||
# https://www.unicode.org/Public/MAPPINGS/
|
||||
#
|
||||
# Our "big5" comes from BIG5.TXT, with the addition of the characters
|
||||
# in the range 0xf9d6-0xf9dc from CP950.TXT.
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
# map files provided by Unicode organization.
|
||||
# Unfortunately it is prohibited by the organization
|
||||
# to distribute the map files. So if you try to use this script,
|
||||
# you have to obtain the map files from the organization's ftp site.
|
||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
||||
# you have to obtain the map files from the organization's download site.
|
||||
# https://www.unicode.org/Public/MAPPINGS/
|
||||
# We assume the file includes three tab-separated columns:
|
||||
# JOHAB code in hex
|
||||
# UCS-2 code in hex
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
# map files provided by Unicode organization.
|
||||
# Unfortunately it is prohibited by the organization
|
||||
# to distribute the map files. So if you try to use this script,
|
||||
# you have to obtain the map files from the organization's ftp site.
|
||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
||||
# you have to obtain the map files from the organization's download site.
|
||||
# https://www.unicode.org/Public/MAPPINGS/
|
||||
# We assume the file includes three tab-separated columns:
|
||||
# source character set code in hex
|
||||
# UCS-2 code in hex
|
||||
|
|
|
@ -23,7 +23,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
|
|||
# These files are part of the Unicode Character Database. Download
|
||||
# them on demand.
|
||||
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
|
||||
$(DOWNLOAD) http://unicode.org/Public/UNIDATA/$(@F)
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
|
||||
|
||||
# Generation of conversion tables used for string normalization with
|
||||
# UTF-8 strings.
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
* Normalize a Unicode string to NFKC form
|
||||
*
|
||||
* This implements Unicode normalization, per the documentation at
|
||||
* http://www.unicode.org/reports/tr15/.
|
||||
* https://www.unicode.org/reports/tr15/.
|
||||
*
|
||||
* Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
|
||||
*
|
||||
|
@ -109,7 +109,7 @@ get_decomposed_size(pg_wchar code)
|
|||
/*
|
||||
* Fast path for Hangul characters not stored in tables to save memory as
|
||||
* decomposition is algorithmic. See
|
||||
* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||
* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||
* the matter.
|
||||
*/
|
||||
if (code >= SBASE && code < SBASE + SCOUNT)
|
||||
|
@ -234,7 +234,7 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
|
|||
/*
|
||||
* Fast path for Hangul characters not stored in tables to save memory as
|
||||
* decomposition is algorithmic. See
|
||||
* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||
* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||
* the matter.
|
||||
*/
|
||||
if (code >= SBASE && code < SBASE + SCOUNT)
|
||||
|
@ -362,7 +362,7 @@ unicode_normalize_kc(const pg_wchar *input)
|
|||
continue;
|
||||
|
||||
/*
|
||||
* Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
|
||||
* Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
|
||||
* a sequence of two adjacent characters in a string is an
|
||||
* exchangeable pair if the combining class (from the Unicode
|
||||
* Character Database) for the first character is greater than the
|
||||
|
|
Ładowanie…
Reference in New Issue