gcc/contrib/unicode/gen_libstdcxx_unicode_data.py

#!/usr/bin/env python3
#
# Script to generate the Unicode data tables for libstdc++, which are used
# for std::format width estimation and grapheme cluster breaking.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
# Then run this script and save the output to
# ../../libstdc++-v3/include/bits/unicode-data.h

import sys
import re
import math

# Emit the banner and the guard at the top of the generated header.  The guard
# makes the header fail to compile unless the includer has defined
# _GLIBCXX_GET_UNICODE_DATA to the data version this script produces (150100).
# The banner names this script, so that readers of the generated header can
# find the generator.
print("// Generated by contrib/unicode/gen_libstdcxx_unicode_data.py, do not edit.\n")
print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
print('# error "This is not a public header, do not include it directly"')
print("#elif _GLIBCXX_GET_UNICODE_DATA != 150100")
print('# error "Version mismatch for Unicode static data"')
print("#endif\n")

# Scan a sequence and report every position at which the value changes.
# The result is a list of tuples (index, val), one for each element whose val
# differs from that of the element before it.  The first element is compared
# against init, so it is only reported when it differs from init.
# e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
# and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init = None):
    # Lazily produce each (index, val) pair that starts a new run of values.
    def transitions():
        last_seen = init
        for position, value in enumerate(vals):
            if value != last_seen:
                last_seen = value
                yield (position, value)

    return list(transitions())

# Table of property values indexed by code point.  The script rebinds this
# name to a fresh, fully sized list before building each table.
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Record val in all_code_points for every code point in code_points.

    code_points is either a single hexadecimal code point or an inclusive
    range written as "first..last".  Whitespace around the numbers is
    accepted, because int() ignores it.  Example arguments:
        1100..115F, x
        232A, y

    Raises ValueError if code_points is not a valid code point or range, if
    the range is reversed, or if it does not lie entirely inside the current
    all_code_points table.
    """
    bounds = code_points.split("..")
    if len(bounds) == 1:
        first = last = int(bounds[0], base=16)
    elif len(bounds) == 2:
        first = int(bounds[0], base=16)
        last = int(bounds[1], base=16)
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))

    # Slice assignment never fails for out-of-range indices: it would silently
    # grow the table or write to the wrong place.  Reject bad ranges instead,
    # so that corrupt input cannot produce a bogus table.
    if not 0 <= first <= last < len(all_code_points):
        raise ValueError(
            "code point range out of bounds: {!r}".format(code_points))

    all_code_points[first:last + 1] = [val] * (last - first + 1)

# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
# The Unicode Character Database files are UTF-8, so decode them as such
# instead of relying on the locale's default encoding, and make sure the
# file is closed once it has been read.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000           ; F
        # 3001..3003     ; W
        line = line.split("#")[0]  # Discard any trailing comment.
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.  Widths are only ever 1 or 2 and the scan starts
# from width 1, so the entries alternate between the first code point of a
# width-2 run and the first code point of the width-1 run that follows it.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

print("  // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print("  // from EastAsianWidth.txt from the Unicode standard.")
print("  inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point is emitted.
    if i % 8:
        print(" ", end="")
    else:
        print("\n    ", end="")
    print("{:#x},".format(c), end="")
print("\n  };\n")

# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The Unicode Character Database files are UTF-8, so decode them as such
# instead of relying on the locale's default encoding, and make sure the
# file is closed once it has been read.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example (code_points, property) pairs after stripping:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]  # Discard any trailing comment.
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

# Every code point at which the property changes, including code point 0.
edges = find_edges(all_code_points)

# Number the properties in order of first appearance, with Other fixed at 0.
gcb_props = {"Other":0}
for c, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)

# Number of low bits needed to hold any of the property numbers.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property

print("  enum class _Gcb_property {")
for name, number in gcb_props.items():
    print("    _Gcb_{} = {},".format(name, number))
print("  };\n")

# Tables for std::__unicode::_Grapheme_cluster_state

print("  // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print("  // from GraphemeBreakProperty.txt from the Unicode standard.")
print("  // Entries are (code_point << shift_bits) + property.")
print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print("  inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n    ", end="")
    # Pack the property number into the low bits below the code point.
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n  };\n")

# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The Unicode Character Database files are UTF-8, so decode them as such
# instead of relying on the locale's default encoding, and make sure the
# file is closed once it has been read.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D       ; InCB; Linker
        # 0B71       ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]  # Discard any trailing comment.
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print("  inline constexpr char32_t __incb_linkers[] = {\n   ", end="")
# The list of linkers is built in full before the loop starts, so it is safe
# to modify all_code_points inside the loop.
linkers = [i for i, p in enumerate(all_code_points) if p == "Linker"]
for i in linkers:
    print(" 0x{:04x},".format(i), end="")
    # Linkers are listed separately, so reset them to the default value to
    # keep them out of the edges table below.
    all_code_points[i] = None
print("\n  };\n")

# The default is None and so is find_edges' initial value, so no entry is
# created for code point 0 unless it has a property.
edges = find_edges(all_code_points)

incb_props = {None:0, "Consonant":1, "Extend":2}
print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
print("  // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print("  // from DerivedCoreProperties.txt from the Unicode standard.")
print("  // Entries are (code_point << 2) + property.")
print("  inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n    ", end="")
    # Pack the property number into the two low bits below the code point.
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n  };\n")

# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The Unicode Character Database files are UTF-8, so decode them as such
# instead of relying on the locale's default encoding, and make sure the
# file is closed once it has been read.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A       ; Extended_Pictographic
        line = line.split("#")[0]  # Discard any trailing comment.
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

# The values are only True or False and the scan starts from False, so the
# entries alternate between the first code point of an Extended_Pictographic
# run and the first code point after it.
edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
print("  // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print("  // from emoji-data.txt from the Unicode standard.")
print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point is emitted.
    if i % 8:
        print(" ", end="")
    else:
        print("\n    ", end="")
    print("{:#x},".format(c), end="")
print("\n  };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")