localedata/unicode-gen/gen_translit_combining.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Generate a translit_combining file from a UnicodeData file.
   5 # Copyright (C) 2015-2022 Free Software Foundation, Inc.
   6 # This file is part of the GNU C Library.
   7 #
   8 # The GNU C Library is free software; you can redistribute it and/or
   9 # modify it under the terms of the GNU Lesser General Public
  10 # License as published by the Free Software Foundation; either
  11 # version 2.1 of the License, or (at your option) any later version.
  12 #
  13 # The GNU C Library is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 # Lesser General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU Lesser General Public
  19 # License along with the GNU C Library; if not, see
  20 # <https://www.gnu.org/licenses/>.
  21
  22 '''
  23 Generate a translit_combining file from UnicodeData.txt
  24
  25 To see how this script is used, call it with the “-h” option:
  26
  27     $ ./gen_translit_combining.py -h
  28     … prints usage message …
  29 '''
  30
  31 import argparse
  32 import time
  33 import unicode_utils
  34
  35 def read_input_file(filename):
  36     '''Reads the original glibc translit_combining file to get the
  37     original head and tail.
  38
  39     We want to replace only the part of the file between
  40     “translit_start” and “translit_end”
  41     '''
  42     head = tail = ''
  43     with open(filename, mode='r') as translit_file:
  44         for line in translit_file:
  45             head = head + line
  46             if line.startswith('translit_start'):
  47                 break
  48         for line in translit_file:
  49             if line.startswith('translit_end'):
  50                 tail = line
  51                 break
  52         for line in translit_file:
  53             tail = tail + line
  54     return (head, tail)
  55
  56 def output_head(translit_file, unicode_version, head=''):
  57     '''Write the header of the output file, i.e. the part of the file
  58     before the “translit_start” line.
  59     '''
  60     if ARGS.input_file and head:
  61         translit_file.write(head)
  62     else:
  63         translit_file.write('escape_char /\n')
  64         translit_file.write('comment_char %\n')
  65         translit_file.write(unicode_utils.COMMENT_HEADER)
  66         translit_file.write('\n')
  67         translit_file.write('% Transliterations that remove all ')
  68         translit_file.write('combining characters (accents,\n')
  69         translit_file.write('% pronounciation marks, etc.).\n')
  70         translit_file.write('% Generated automatically from UnicodeData.txt '
  71                             + 'by gen_translit_combining.py '
  72                             + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
  73                             + 'for Unicode {:s}.\n'.format(unicode_version))
  74         translit_file.write('\n')
  75         translit_file.write('LC_CTYPE\n')
  76         translit_file.write('\n')
  77         translit_file.write('translit_start\n')
  78
  79 def output_tail(translit_file, tail=''):
  80     '''Write the tail of the output file'''
  81     if ARGS.input_file and tail:
  82         translit_file.write(tail)
  83     else:
  84         translit_file.write('translit_end\n')
  85         translit_file.write('\n')
  86         translit_file.write('END LC_CTYPE\n')
  87
  88 def is_combining_remove(code_point):
  89     '''Check whether this is a combining character which should be listed
  90     in the section of the translit_combining file where combining
  91     characters are replaced by empty strings.
  92
  93     We ignore combining characters from many scripts here because
  94     the original translit_combining file didn’t do this for the
  95     combining characters from these scripts either and I am not
  96     sure yet whether this would be useful to do for all combining
  97     characters or not. For the moment I think it is better to keep
  98     close to the spirit of the original file.
  99     '''
 100     if not unicode_utils.is_combining(code_point):
 101         return False
 102     name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
 103     for substring in ('DEVANAGARI',
 104                       'BENGALI',
 105                       'CYRILLIC',
 106                       'SYRIAC',
 107                       'THAANA',
 108                       'NKO',
 109                       'GURMUKHI',
 110                       'TAMIL',
 111                       'GUJARATI',
 112                       'ORIYA',
 113                       'TELUGU',
 114                       'KANNADA',
 115                       'MALAYALAM',
 116                       'SINHALA',
 117                       'THAI',
 118                       'LAO',
 119                       'TIBETAN',
 120                       'MYANMAR',
 121                       'ETHIOPIC',
 122                       'TAGALOG',
 123                       'HANUNOO',
 124                       'BUHID',
 125                       'TAGBANWA',
 126                       'KHMER',
 127                       'MONGOLIAN',
 128                       'LIMBU',
 129                       'NEW TAI LUE',
 130                       'BUGINESE',
 131                       'BALINESE',
 132                       'SUNDANESE',
 133                       'LEPCHA',
 134                       'IDEOGRAPHIC',
 135                       'HANGUL',
 136                       'SYLOTI',
 137                       'SAURASHTRA',
 138                       'KAYAH',
 139                       'REJANG',
 140                       'CHAM',
 141                       'VARIATION SELECTOR',
 142                       'KHAROSHTHI',
 143                       'MUSICAL SYMBOL',
 144                       'SAMARITAN',
 145                       'MANDAIC',
 146                       'TAI THAM',
 147                       'BATAK',
 148                       'VEDIC',
 149                       'COPTIC',
 150                       'TIFINAGH',
 151                       'BAMUM',
 152                       'JAVANESE',
 153                       'TAI VIET',
 154                       'MEETEI',
 155                       'MANICHAEAN',
 156                       'BRAHMI',
 157                       'KAITHI',
 158                       'CHAKMA',
 159                       'MAHAJANI',
 160                       'SHARADA',
 161                       'KHOJKI',
 162                       'KHUDAWADI',
 163                       'GRANTHA',
 164                       'TIRHUTA',
 165                       'SIDDHAM',
 166                       'MODI VOWEL',
 167                       'MODI SIGN',
 168                       'TAKRI',
 169                       'BASSA VAH',
 170                       'PAHAWH HMONG',
 171                       'MIAO',
 172                       'DUPLOYAN',
 173                       'MENDE KIKAKUI',
 174                       'AHOM',
 175                       'SIGNWRITING'
 176     ):
 177         if substring in name:
 178             return False
 179     return True
 180
 181 def canonical_decompose(code_point):
 182     '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
 183
 184     In some instances a canonical mapping or a compatibility mapping
 185     may consist of a single character. For a canonical mapping, this
 186     indicates that the character is a canonical equivalent of another
 187     single character. For a compatibility mapping, this indicates that
 188     the character is a compatibility equivalent of another single
 189     character.
 190
 191     A canonical mapping may also consist of a pair of characters, but
 192     is never longer than two characters. When a canonical mapping
 193     consists of a pair of characters, the first character may itself
 194     be a character with a decomposition mapping, but the second
 195     character never has a decomposition mapping.
 196
 197     We ignore the canonical decomposition for code points
 198     matching certain substrings because the original translit_combining
 199     file didn’t include these types of characters either. I am unsure
 200     about the usefulness of including them and want to keep close
 201     to the spirit of the original file for the moment.
 202     '''
 203     name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
 204     for substring in ('MUSICAL SYMBOL',
 205                       'CJK COMPATIBILITY IDEOGRAPH',
 206                       'BALINESE',
 207                       'KAITHI LETTER',
 208                       'CHAKMA VOWEL',
 209                       'GRANTHA VOWEL',
 210                       'TIRHUTA VOWEL',
 211                       'SIDDHAM VOWEL'):
 212         if substring in name:
 213             return []
 214     decomposition = unicode_utils.UNICODE_ATTRIBUTES[
 215         code_point]['decomposition']
 216     if decomposition and not decomposition.startswith('<'):
 217         decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')]
 218         if decomposed_code_points:
 219             cd0 = canonical_decompose(decomposed_code_points[0])
 220             if cd0:
 221                 decomposed_code_points = cd0 + decomposed_code_points[1:]
 222         return decomposed_code_points
 223     else:
 224         return []
 225
 226 def special_decompose(code_point_list):
 227     '''
 228     Decompositions which are not canonical or which are not in
 229     UnicodeData.txt at all but some of these were used in the original
 230     translit_combining file in glibc and they seemed to make sense.
 231     I want to keep the update of translit_combining close to the
 232     spirit of the original file, therefore I added these special
 233     decomposition rules here.
 234     '''
 235     special_decompose_dict = {
 236         # Ø U+00D8 is already handled in translit_neutral. But
 237         # translit_combining is usually included after translit_neutral
 238         # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
 239         # has a canonical decomposition to Ø U+00D8 and we want to
 240         # further decompose this to U+004F.
 241         (0x00D8,): [0x004F], # Ø → O
 242         # ø U+00F8 is already handled in translit_neutral. But
 243         # translit_combining is usually included after translit_neutral
 244         # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
 245         # has a canonical decomposition to ø U+00F8 and we want to
 246         # further decompose this to U+006F.
 247         (0x00F8,): [0x006F], # ø → o
 248         # æ U+00E6 is already in translit_compat because ligatures
 249         # are handled in translit_compat. But ǣ U+01E3 has a
 250         # canonical decomposition to U+00E6, U+0304 and we want to
 251         # further decompose this to “ae”.
 252         (0x00E6,): [0x0061, 0x0065], # æ → ae
 253         # Æ U+00C6  is already in translit_compat because ligatures
 254         # are handled in translit_compat. But Ǣ U+01E2 has a
 255         # canonical decomposition to U+00C6, U+0304 and we want to
 256         # further decompose this to “AE”
 257         (0x00C6,): [0x0041, 0x0045], # Æ → AE
 258         # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
 259         # translit_compat because ligatures are handled in translit_compat.
 260         # But U+FB1F has a canonical decomposition to U+05F2 and
 261         # we want to further decompose this to U+05D9, U+05D9.
 262         (0x05F2,): [0x05D9, 0x05D9], # ײ → יי
 263         # 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt
 264         # But U+2000 EN QUAD has a canonical decomposition U+2002
 265         # and we want to further decompose this to U+0020.
 266         (0x2002,): [0x0020], # EN SPACE → SPACE
 267         # 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt
 268         # But U+2001 EM QUAD has a canonical decomposition to U+2003
 269         # and we want to further decompose this to U+0020.
 270         (0x2003,): [0x0020], # EM SPACE → SPACE
 271         # U+2260 ≠ has the canonical decomposition U+003D U+0338
 272         # (= followed by ̸). After stripping the combining characters,
 273         # the result is only = which reverses the meaning.
 274         # Therefore, we add a special rules here for such mathematical
 275         # negations:
 276         (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
 277         (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
 278         (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
 279         (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
 280         (0x2204,): [0x0021, 0x2203], # ∄ → !∃
 281         (0x2209,): [0x0021, 0x2208], # ∉ → !∈
 282         (0x220C,): [0x0021, 0x220B], # ∌ → !∋
 283         (0x2224,): [0x0021, 0x2223], # ∤ → !∣
 284         (0x2226,): [0x0021, 0x2225], # ∦ → !∥
 285         (0x2241,): [0x0021, 0x007E], # ≁ → !~
 286         (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
 287         (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
 288         (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
 289         (0x2260,): [0x0021, 0x003D], # ≠ → !=
 290         (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
 291         (0x226D,): [0x0021, 0x224D], # ≭ → !≍
 292         (0x226E,): [0x0021, 0x003C], # ≮ → !<
 293         (0x226F,): [0x0021, 0x003E], # ≯ → !>
 294         (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
 295         (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
 296         (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
 297         (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
 298         (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
 299         (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
 300         (0x2280,): [0x0021, 0x227A], # ⊀ → !≺
 301         (0x2281,): [0x0021, 0x227B], # ⊁ → !≻
 302         (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
 303         (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
 304         (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
 305         (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
 306         (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
 307         (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
 308         (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
 309         (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
 310         (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
 311         (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
 312         (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
 313         (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
 314         (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
 315         (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
 316         (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
 317         (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
 318         (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
 319         # Special rule for 〈 U+3008 is added
 320         # because 〉 U+2329 has the canonical decomposition U+3008
 321         # and we want to further decompose this to > U+003C.
 322         (0x3008,): [0x003C], # 〈 → <
 323         # Special rule for 〉 U+3009 is added
 324         # because 〉 U+232A has the canonical decomposition U+3009
 325         # and we want to further decompose this to < U+003E.
 326         (0x3009,): [0x003E], # 〉→ >
 327     }
 328     if tuple(code_point_list) in special_decompose_dict:
 329         return special_decompose_dict[tuple(code_point_list)]
 330     else:
 331         return code_point_list
 332
 333 def output_combining_remove(translit_file):
 334     '''Write the section of the translit_combining file where combining
 335     characters are replaced by empty strings.
 336     '''
 337     translit_file.write('\n')
 338     for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
 339         name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
 340         if is_combining_remove(code_point):
 341             translit_file.write('% {:s}\n'.format(name))
 342             translit_file.write('{:s} ""\n'.format(
 343                 unicode_utils.ucs_symbol(code_point)))
 344     translit_file.write('\n')
 345
 346 def output_decompositions(translit_file):
 347     '''Write the section of the translit_combining file where characters
 348     characters are decomposed and combining characters stripped from
 349     the decompositions.
 350     '''
 351     for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
 352         if special_decompose([code_point]) != [code_point]:
 353             decomposed_code_points = [special_decompose([code_point])]
 354         else:
 355             decomposed_code_points = [canonical_decompose(code_point)]
 356         if decomposed_code_points[0]:
 357             while True:
 358                 special_decomposed_code_points = special_decompose(
 359                     decomposed_code_points[-1])
 360                 if (special_decomposed_code_points
 361                         != decomposed_code_points[-1]):
 362                     decomposed_code_points.append(
 363                         special_decomposed_code_points)
 364                     continue
 365                 special_decomposed_code_points = []
 366                 for decomposed_code_point in decomposed_code_points[-1]:
 367                     special_decomposed_code_points += special_decompose(
 368                         [decomposed_code_point])
 369                 if (special_decomposed_code_points
 370                         == decomposed_code_points[-1]):
 371                     break
 372                 decomposed_code_points.append(
 373                     special_decomposed_code_points)
 374             for index in range(0, len(decomposed_code_points)):
 375                 decomposed_code_points[index] = [
 376                     x for x in decomposed_code_points[index]
 377                     if not is_combining_remove(x)]
 378         if decomposed_code_points[0]:
 379             translit_file.write('% {:s}\n'.format(
 380                 unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
 381             translit_file.write('{:s} '.format(
 382                 unicode_utils.ucs_symbol(code_point)))
 383             for index in range(0, len(decomposed_code_points)):
 384                 if index > 0:
 385                     translit_file.write(';')
 386                 if len(decomposed_code_points[index]) > 1:
 387                     translit_file.write('"')
 388                 for decomposed_code_point in decomposed_code_points[index]:
 389                     translit_file.write('{:s}'.format(
 390                         unicode_utils.ucs_symbol(decomposed_code_point)))
 391                 if len(decomposed_code_points[index]) > 1:
 392                     translit_file.write('"')
 393             translit_file.write('\n')
 394     translit_file.write('\n')
 395
 396 def output_transliteration(translit_file):
 397     '''Write the new transliteration to the output file'''
 398     output_combining_remove(translit_file)
 399     output_decompositions(translit_file)
 400
 401 if __name__ == "__main__":
 402     PARSER = argparse.ArgumentParser(
 403         description='''
 404         Generate a translit_combining file from UnicodeData.txt.
 405         ''')
 406     PARSER.add_argument(
 407         '-u', '--unicode_data_file',
 408         nargs='?',
 409         type=str,
 410         default='UnicodeData.txt',
 411         help=('The UnicodeData.txt file to read, '
 412               + 'default: %(default)s'))
 413     PARSER.add_argument(
 414         '-i', '--input_file',
 415         nargs='?',
 416         type=str,
 417         help=''' The original glibc/localedata/locales/translit_combining
 418         file.''')
 419     PARSER.add_argument(
 420         '-o', '--output_file',
 421         nargs='?',
 422         type=str,
 423         default='translit_combining.new',
 424         help='''The new translit_combining file, default: %(default)s.  If the
 425         original glibc/localedata/locales/translit_combining file has
 426         been given as an option, the header up to the
 427         “translit_start” line and the tail from the “translit_end”
 428         line to the end of the file will be copied unchanged into the
 429         output file.  ''')
 430     PARSER.add_argument(
 431         '--unicode_version',
 432         nargs='?',
 433         required=True,
 434         type=str,
 435         help='The Unicode version of the input files used.')
 436     ARGS = PARSER.parse_args()
 437
 438     unicode_utils.fill_attributes(ARGS.unicode_data_file)
 439     HEAD = TAIL = ''
 440     if ARGS.input_file:
 441         (HEAD, TAIL) = read_input_file(ARGS.input_file)
 442     with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
 443         output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
 444         output_transliteration(TRANSLIT_FILE)
 445         output_tail(TRANSLIT_FILE, tail=TAIL)