
Fix some copy-and-pasted logic in __is_extended_pictographic. This function should yield false for the values before the first edge, not true. Also add a missing boundary condition check in __incb_property. Also fix an off-by-one error in _Utf_iterator::operator++() that would make dereferencing a past-the-end iterator undefined (where the intended design is that the iterator is always incrementable and dereferenceable, for better memory safety). Also simplify the grapheme view iterator, which still contained some remnants of an earlier design I was experimenting with. Slightly tweak the gen_libstdcxx_unicode_data.py script so that the _Gcb_property enumerators are in the order we encounter them in the data file, instead of sorting them alphabetically. Start with the "Other" property at value 0, because that's the default property for anything not in the file. This makes no practical difference, but seems cleaner. It causes the values in the __gcb_edges table to change, so it can only be done now, before anybody is using this code. The enumerator values and table entries become ABI artefacts for the function using them. contrib/ChangeLog: * unicode/gen_libstdcxx_unicode_data.py: Print out Gcb_property enumerators in the order they're seen, not alphabetical order. libstdc++-v3/ChangeLog: * include/bits/unicode-data.h: Regenerate. * include/bits/unicode.h (_Utf_iterator::operator++()): Fix off-by-one error. (__incb_property): Add missing check for values before the first edge. (__is_extended_pictographic): Invert return values to fix copy-and-pasted logic. (_Grapheme_cluster_view::_Iterator): Remove second iterator member and find end of cluster lazily. * testsuite/ext/unicode/grapheme_view.cc: New test. * testsuite/ext/unicode/properties.cc: New test. * testsuite/ext/unicode/view.cc: New test.
225 lines
8 KiB
Python
Executable file
225 lines
8 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
#
|
||
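# Script to generate Unicode data tables for libstdc++: std::format width
# estimation, Grapheme_Cluster_Break, Indic_Conjunct_Break and
# Extended_Pictographic properties.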
|
||
#
|
||
# This file is part of GCC.
|
||
#
|
||
# GCC is free software; you can redistribute it and/or modify it under
|
||
# the terms of the GNU General Public License as published by the Free
|
||
# Software Foundation; either version 3, or (at your option) any later
|
||
# version.
|
||
#
|
||
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
# for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with GCC; see the file COPYING3. If not see
|
||
# <http://www.gnu.org/licenses/>.
|
||
|
||
# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
|
||
# Then run this script and save the output to
|
||
# ../../libstdc++-v3/include/bits/unicode-data.h
|
||
|
||
import sys
|
||
import re
|
||
import math
|
||
|
||
# Emit the preamble of the generated header: a "do not edit" banner followed
# by a preprocessor guard that rejects direct inclusion and mismatched
# Unicode data versions.
print("\n".join((
    "// Generated by contrib/unicode/gen_std_format_width.py, do not edit.\n",
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 150100",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
)))
|
||
|
||
def find_edges(vals, init = None):
    """Return the positions in `vals` where the value changes.

    The result is a list of (index, val) tuples, one for every element of
    `vals` that differs from the element before it.  The first element is
    compared against `init`.

    e.g. find_edges([a, a, b, b, c, b, b, d]) is
         [(0,a), (2,b), (4,c), (5,b), (7,d)]
    and  find_edges([a, a, b, b, c, b, b, d], a) is
         [(2,b), (4,c), (5,b), (7,d)]
    """
    transitions = []
    last_seen = init
    for position, current in enumerate(vals):
        changed = current != last_seen
        if changed:
            transitions.append((position, current))
            last_seen = current
    return transitions
|
||
|
||
# Table indexed by code point holding the property value currently being
# collected.  It is re-bound to a fresh, fully sized list before each
# property is processed.
all_code_points = []


def process_code_points(code_points, val):
    """Record `val` for a code point or an inclusive range of code points.

    `code_points` is either a single hexadecimal code point or two of them
    separated by "..", and `val` is the property value to store in the
    module-level `all_code_points` table.

    Example arguments:
      1100..115F, x
      232A, y

    Raises ValueError if `code_points` contains more than one ".." or if
    the range is reversed or does not fit inside `all_code_points`.
    A single code point outside the table raises IndexError.
    """
    bounds = code_points.split("..")
    if len(bounds) == 1:
        c = int(bounds[0], base=16)
        all_code_points[c] = val
    elif len(bounds) == 2:
        begin = int(bounds[0], base=16)
        end = int(bounds[1], base=16) + 1
        # Slice assignment would silently grow the table (or do nothing for
        # a reversed range), so reject anything that is not a proper
        # sub-range of the table.
        if not 0 <= begin < end <= len(all_code_points):
            raise ValueError(
                "code point range out of bounds: {!r}".format(code_points))
        all_code_points[begin:end] = [val] * (end - begin)
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
|
||
|
||
# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
# The file is opened with an explicit encoding so that parsing does not depend
# on the locale, and inside a `with` block so that it is closed promptly.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.
edges = find_edges(all_code_points, 1)
|
||
|
||
# Table for std::__unicode::__format_width(char32_t)
# Only the code point of each edge is written, eight entries per row.

print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.");
print(" inline constexpr char32_t __width_edges[] = {", end="")
for index, (code_point, _) in enumerate(edges):
    # Start a new row before every eighth entry, otherwise separate
    # entries with a single space.
    separator = " " if index % 8 else "\n "
    print(separator + "{:#x},".format(code_point), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The file is opened with an explicit encoding so that parsing does not depend
# on the locale, and inside a `with` block so that it is closed promptly.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Number the properties in the order they are first seen among the edges.
# "Other" is pinned to 0 because it is the default for any code point that is
# not listed in the data file.
gcb_props = {"Other":0}
for c, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)

# Number of low bits needed to store any property value in a table entry.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))
|
||
|
||
# Enum definition for std::__unicode::_Gcb_property
# One enumerator is written per entry of gcb_props, in dictionary order.

print(" enum class _Gcb_property {")
for name, value in gcb_props.items():
    print(" _Gcb_{} = {},".format(name, value))
print(" };\n")
|
||
|
||
# Tables for std::__unicode::_Grapheme_cluster_state
# Each entry packs the code point of an edge together with the numeric
# value of its property, six entries per row.

print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.");
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for index, (code_point, prop) in enumerate(edges):
    separator = " " if index % 6 else "\n "
    packed = (code_point << shift_bits) + gcb_props[prop]
    print(separator + "{0:#x},".format(packed), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The file is opened with an explicit encoding so that parsing does not depend
# on the locale, and inside a `with` block so that it is closed promptly.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())
|
||
|
||
# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
# Every Linker code point is reset to None once it has been collected, so
# that it does not show up in the edges computed from all_code_points later.
linker_code_points = [
    code_point
    for code_point, prop in enumerate(all_code_points)
    if prop == "Linker"
]
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
for code_point in linker_code_points:
    all_code_points[code_point] = None
    print(" 0x{:04x},".format(code_point), end="")
print("\n };\n")
|
||
|
||
edges = find_edges(all_code_points)

# Numeric values stored in the low two bits of each table entry.
incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")

# Table for std::__unicode::__incb_property
# Each entry packs the code point of an edge together with the numeric
# value of its property, six entries per row.
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.");
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for index, (code_point, prop) in enumerate(edges):
    separator = " " if index % 6 else "\n "
    packed = (code_point << 2) + incb_props[prop]
    print(separator + "{0:#x},".format(packed), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# emoji-data.txt contains non-ASCII characters in its comments, so it is
# decoded explicitly as UTF-8 instead of relying on the locale's default
# encoding.  The `with` block makes sure the file is closed promptly.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

# Code points before the first edge are not Extended_Pictographic, so start
# from False and record only the places where the property flips.
edges = find_edges(all_code_points, False)
|
||
|
||
# Table for std::__unicode::__is_extended_pictographic
# Only the code point of each edge is written, eight entries per row.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from emoji-data.txt from the Unicode standard.");
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for index, (code_point, _) in enumerate(edges):
    separator = " " if index % 8 else "\n "
    print(separator + "{:#x},".format(code_point), end="")
print("\n };\n")
|
||
|
||
# <bits/unicode.h> reports an error if this macro is still defined after the
# generated header has been included.  The #undef is written as the very last
# output so that a partially generated file (e.g. after a failure above) is
# not usable.
print("#undef _GLIBCXX_GET_UNICODE_DATA")
|