Add combining characters to unaccent.rules.

Strip certain classes of combining characters, so that accents encoded this way are removed.

Author: Hugh Ranalli
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
2019-02-01 15:23:01 +01:00 · 2019-02-01 15:23:01 +01:00 · 456e3718e7
parent 80579f9bb1
commit 456e3718e7
4 changed files with 157 additions and 1 deletions
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
 >+-~
 (1 row)
 SELECT unaccent('À');  -- Remove combining diacritical 0x0300
 unaccent 
 ----------
 A
 (1 row)
 SELECT unaccent('unaccent', 'foobar');
 unaccent 
 ----------
@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
 >+-~
 (1 row)
 SELECT unaccent('unaccent', 'À');
 unaccent 
 ----------
 A
 (1 row)
 SELECT ts_lexize('unaccent', 'foobar');
 ts_lexize 
 -----------
@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
 {>+-~}
 (1 row)
 SELECT ts_lexize('unaccent', 'À');
 ts_lexize 
 -----------
 {A}
 (1 row)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                       (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
                       (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
 # A combining mark attaches to the "base" character that precedes it, and
 # the pair forms one composite character. Example: "U&'A\0300'" produces
 # "À". There are three types of combining marks: enclosing (Me),
 # non-spacing combining (Mn) and spacing combining (Mc). Only the ranges
 # of marks we feel safe removing are listed below; each entry is a
 # (first, last) pair of codepoints.
 # References:
 #   https://en.wikipedia.org/wiki/Combining_character
 #   https://www.unicode.org/charts/PDF/U0300.pdf
 #   https://www.unicode.org/charts/PDF/U20D0.pdf
 COMBINING_MARK_RANGES = (
     (0x0300, 0x0362),  # Mn: Accents, IPA
     (0x20dd, 0x20e0),  # Me: Symbols
     (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
 )
 # Print one record on standard output: the character for `codepoint`,
 # followed by a tab and `letter` when `letter` is truthy, or the character
 # alone when `letter` is falsy (for example None or an empty string).
 def print_record(codepoint, letter):
-    print (chr(codepoint) + "\t" + letter)
+    if letter:
        output = chr(codepoint) + "\t" + letter
    else:
        output = chr(codepoint)
    print(output)
 class Codepoint:
    def __init__(self, id, general_category, combining_ids):
@ -70,6 +87,16 @@ class Codepoint:
        self.general_category = general_category
        self.combining_ids = combining_ids
 def is_mark_to_remove(codepoint):
     """Tell whether codepoint is a combining mark that should be stripped.

     A codepoint qualifies when is_mark() accepts it and its id lies
     within one of the inclusive (first, last) ranges listed in
     COMBINING_MARK_RANGES.
     """
     if is_mark(codepoint):
         return any(first <= codepoint.id <= last
                    for first, last in COMBINING_MARK_RANGES)
     return False
 def is_plain_letter(codepoint):
    """Return true if codepoint represents a "plain letter"."""
    for begin, end in PLAIN_LETTER_RANGES:
@ -234,6 +261,8 @@ def main(args):
                             "".join(chr(combining_codepoint.id)
                                     for combining_codepoint \
                                     in get_plain_letters(codepoint, table))))
        elif is_mark_to_remove(codepoint):
            charactersSet.add((codepoint.id, None))
    # add CLDR Latin-ASCII characters
    if not args.noLigaturesExpansion:
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@ -9,13 +9,16 @@ SELECT unaccent('foobar');
 SELECT unaccent('ёлка');
 SELECT unaccent('ЁЖИК');
 SELECT unaccent('˃˖˗˜');
 SELECT unaccent('À');  -- Remove combining diacritical 0x0300
 SELECT unaccent('unaccent', 'foobar');
 SELECT unaccent('unaccent', 'ёлка');
 SELECT unaccent('unaccent', 'ЁЖИК');
 SELECT unaccent('unaccent', '˃˖˗˜');
 SELECT unaccent('unaccent', 'À');
 SELECT ts_lexize('unaccent', 'foobar');
 SELECT ts_lexize('unaccent', 'ёлка');
 SELECT ts_lexize('unaccent', 'ЁЖИК');
 SELECT ts_lexize('unaccent', '˃˖˗˜');
 SELECT ts_lexize('unaccent', 'À');
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@ -414,6 +414,105 @@
 ˖	+
 ˗	-
 ˜	~
 ̀
 ́
 ̂
 ̃
 ̄
 ̅
 ̆
 ̇
 ̈
 ̉
 ̊
 ̋
 ̌
 ̍
 ̎
 ̏
 ̐
 ̑
 ̒
 ̓
 ̔
 ̕
 ̖
 ̗
 ̘
 ̙
 ̚
 ̛
 ̜
 ̝
 ̞
 ̟
 ̠
 ̡
 ̢
 ̣
 ̤
 ̥
 ̦
 ̧
 ̨
 ̩
 ̪
 ̫
 ̬
 ̭
 ̮
 ̯
 ̰
 ̱
 ̲
 ̳
 ̴
 ̵
 ̶
 ̷
 ̸
 ̹
 ̺
 ̻
 ̼
 ̽
 ̾
 ̿
 ̀
 ́
 ͂
 ̓
 ̈́
 ͅ
 ͆
 ͇
 ͈
 ͉
 ͊
 ͋
 ͌
 ͍
 ͎
 ͏
 ͐
 ͑
 ͒
 ͓
 ͔
 ͕
 ͖
 ͗
 ͘
 ͙
 ͚
 ͛
 ͜
 ͝
 ͞
 ͟
 ͠
 ͡
 ͢
 Ά	Α
 Έ	Ε
 Ή	Η
@ -982,6 +1081,13 @@
 ₧	Pts
 ₹	Rs
 ₺	TL
 ⃝
 ⃞
 ⃟
 ⃠
 ⃢
 ⃣
 ⃤
 ℀	a/c
 ℁	a/s
 ℂ	C
 ˖	+
 ˗	-
 ˜	~
+̀
+́
+̂
+̃
+̄
+̅
+̆
+̇
+̈
+̉
+̊
+̋
+̌
+̍
+̎
+̏
+̐
+̑
+̒
+̓
+̔
+̕
+̖
+̗
+̘
+̙
+̚
+̛
+̜
+̝
+̞
+̟
+̠
+̡
+̢
+̣
+̤
+̥
+̦
+̧
+̨
+̩
+̪
+̫
+̬
+̭
+̮
+̯
+̰
+̱
+̲
+̳
+̴
+̵
+̶
+̷
+̸
+̹
+̺
+̻
+̼
+̽
+̾
+̿
+̀
+́
+͂
+̓
+̈́
+ͅ
+͆
+͇
+͈
+͉
+͊
+͋
+͌
+͍
+͎
+͏
+͐
+͑
+͒
+͓
+͔
+͕
+͖
+͗
+͘
+͙
+͚
+͛
+͜
+͝
+͞
+͟
+͠
+͡
+͢
 Ά	Α
 Έ	Ε
 Ή	Η
 ₧	Pts
 ₹	Rs
 ₺	TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
 ℀	a/c
 ℁	a/s
 ℂ	C