From 456e3718e7b72efe4d2639437fcbca2e4ad83099 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 1 Feb 2019 15:23:01 +0100 Subject: [PATCH] Add combining characters to unaccent.rules. Strip certain classes of combining characters, so that accents encoded this way are removed. Author: Hugh Ranalli Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org --- contrib/unaccent/expected/unaccent.out | 18 ++++ contrib/unaccent/generate_unaccent_rules.py | 31 +++++- contrib/unaccent/sql/unaccent.sql | 3 + contrib/unaccent/unaccent.rules | 106 ++++++++++++++++++++ 4 files changed, 157 insertions(+), 1 deletion(-) diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index 69c2cf9bd7..c1bd7cd897 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜'); >+-~ (1 row) +SELECT unaccent('À'); -- Remove combining diacritical 0x0300 + unaccent +---------- + A +(1 row) + SELECT unaccent('unaccent', 'foobar'); unaccent ---------- @@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜'); >+-~ (1 row) +SELECT unaccent('unaccent', 'À'); + unaccent +---------- + A +(1 row) + SELECT ts_lexize('unaccent', 'foobar'); ts_lexize ----------- @@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜'); {>+-~} (1 row) +SELECT ts_lexize('unaccent', 'À'); + ts_lexize +----------- + {A} +(1 row) + diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 4419a771ed..58b6e7deb7 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA +# Combining marks follow a "base" character, and result in a composite +# character. 
Example: "U&'A\0300'" produces "À". There are three types of +# combining marks: enclosing (Me), non-spacing combining (Mn), spacing +# combining (Mc). We identify the ranges of marks we feel safe removing. +# References: +# https://en.wikipedia.org/wiki/Combining_character +# https://www.unicode.org/charts/PDF/U0300.pdf +# https://www.unicode.org/charts/PDF/U20D0.pdf +COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA + (0x20dd, 0x20E0), # Me: Symbols + (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle + def print_record(codepoint, letter): - print (chr(codepoint) + "\t" + letter) + if letter: + output = chr(codepoint) + "\t" + letter + else: + output = chr(codepoint) + + print(output) class Codepoint: def __init__(self, id, general_category, combining_ids): @@ -70,6 +87,16 @@ class Codepoint: self.general_category = general_category self.combining_ids = combining_ids +def is_mark_to_remove(codepoint): + """Return true if this is a combining mark to remove.""" + if not is_mark(codepoint): + return False + + for begin, end in COMBINING_MARK_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False + def is_plain_letter(codepoint): """Return true if codepoint represents a "plain letter".""" for begin, end in PLAIN_LETTER_RANGES: @@ -234,6 +261,8 @@ def main(args): "".join(chr(combining_codepoint.id) for combining_codepoint \ in get_plain_letters(codepoint, table)))) + elif is_mark_to_remove(codepoint): + charactersSet.add((codepoint.id, None)) # add CLDR Latin-ASCII characters if not args.noLigaturesExpansion: diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index c671827caa..2ae097ff2b 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -9,13 +9,16 @@ SELECT unaccent('foobar'); SELECT unaccent('ёлка'); SELECT unaccent('ЁЖИК'); SELECT unaccent('˃˖˗˜'); +SELECT unaccent('À'); -- Remove combining diacritical 0x0300 SELECT unaccent('unaccent', 'foobar'); 
SELECT unaccent('unaccent', 'ёлка'); SELECT unaccent('unaccent', 'ЁЖИК'); SELECT unaccent('unaccent', '˃˖˗˜'); +SELECT unaccent('unaccent', 'À'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); SELECT ts_lexize('unaccent', 'ЁЖИК'); SELECT ts_lexize('unaccent', '˃˖˗˜'); +SELECT ts_lexize('unaccent', 'À'); diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 7ce25eef03..99826408ac 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -414,6 +414,105 @@ ˖ + ˗ - ˜ ~ +̀ +́ +̂ +̃ +̄ +̅ +̆ +̇ +̈ +̉ +̊ +̋ +̌ +̍ +̎ +̏ +̐ +̑ +̒ +̓ +̔ +̕ +̖ +̗ +̘ +̙ +̚ +̛ +̜ +̝ +̞ +̟ +̠ +̡ +̢ +̣ +̤ +̥ +̦ +̧ +̨ +̩ +̪ +̫ +̬ +̭ +̮ +̯ +̰ +̱ +̲ +̳ +̴ +̵ +̶ +̷ +̸ +̹ +̺ +̻ +̼ +̽ +̾ +̿ +̀ +́ +͂ +̓ +̈́ +ͅ +͆ +͇ +͈ +͉ +͊ +͋ +͌ +͍ +͎ +͏ +͐ +͑ +͒ +͓ +͔ +͕ +͖ +͗ +͘ +͙ +͚ +͛ +͜ +͝ +͞ +͟ +͠ +͡ +͢ Ά Α Έ Ε Ή Η @@ -982,6 +1081,13 @@ ₧ Pts ₹ Rs ₺ TL +⃝ +⃞ +⃟ +⃠ +⃢ +⃣ +⃤ ℀ a/c ℁ a/s ℂ C