gccrs: Add function for Unicode NFC normalization

gcc/rust/ChangeLog:

	* Make-lang.in: Add rust-unicode.o
	* rust-lang.cc (run_rust_tests): Add test.
	* rust-system.h: Include <array>
	* util/make-rust-unicode.py: Generator of rust-unicode-data.h.
	* util/rust-unicode-data.h: Auto-generated file.
	* util/rust-unicode.cc: New file.
	* util/rust-unicode.h: New file.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
This commit is contained in:
Raiki Tamura 2023-07-14 14:45:34 +09:00 committed by Arthur Cohen
parent fa13cfd08a
commit 4d63098451
7 changed files with 5879 additions and 0 deletions

View file

@ -181,6 +181,7 @@ GRS_OBJS = \
rust/rust-feature.o \
rust/rust-feature-gate.o \
rust/rust-dir-owner.o \
rust/rust-unicode.o \
$(END)
# removed object files from here

View file

@ -37,6 +37,7 @@
#include "rust-ast-resolve-item.h"
#include "rust-lex.h"
#include "optional.h"
#include "rust-unicode.h"
#include <mpfr.h>
// note: header files must be in this order or else forward declarations don't
@ -455,6 +456,7 @@ run_rust_tests ()
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
rust_simple_path_resolve_test ();
rust_utf8_normalize_test ();
}
} // namespace selftest

View file

@ -43,6 +43,7 @@
#include <memory>
#include <utility>
#include <fstream>
#include <array>
// Rust frontend requires C++11 minimum, so will have unordered_map and set
#include <unordered_map>

View file

@ -0,0 +1,289 @@
# Copyright (C) 2020-2023 Free Software Foundation, Inc.
# This file is part of GCC.
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
# Run this program as
# python ./make-rust-unicode.py UnicodeData.txt \
# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
# > rust-unicode-data.h
import sys
# License banner, written as C++ line comments, that main() prints at the very
# top of the generated header.
COPYRIGHT = (
"// Copyright (C) 2020-2023 Free Software Foundation, Inc.\n"
"\n"
"// This file is part of GCC.\n"
"\n"
"// GCC is free software; you can redistribute it and/or modify it under\n"
"// the terms of the GNU General Public License as published by the Free\n"
"// Software Foundation; either version 3, or (at your option) any later\n"
"// version.\n"
"\n"
"// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
"// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
"// for more details.\n"
"\n"
"// You should have received a copy of the GNU General Public License\n"
"// along with GCC; see the file COPYING3. If not see\n"
"// <http://www.gnu.org/licenses/>."
)
# Canonical Decomposition_Mapping table: codepoint -> list of one or two
# codepoints (compatibility mappings are not stored).
decomposition_map = {}
# Canonical_Combining_Class table: codepoint -> CCC. Only non-zero classes are
# stored.
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges = []
# Codepoints whose General_Category is Nd, Nl or No
numeric_codepoints = []
# Every range is stored as a two-element list `[m, n]` that stands for the
# half-open interval [m, n).
def binary_search_ranges(ranges, target):
    """Return the index of the range that contains `target`, or -1.

    `ranges` must be sorted in ascending order and must not overlap.
    """
    lo, hi = 0, len(ranges) - 1
    while lo <= hi:
        pivot = (lo + hi) // 2
        first, past_last = ranges[pivot]
        if target < first:
            # Everything at or after the pivot starts above the target.
            hi = pivot - 1
        elif target >= past_last:
            # Everything at or before the pivot ends at or below the target.
            lo = pivot + 1
        else:
            return pivot
    return -1
# Utility function to parse '<codepoint>..<codepoint>' or '<codepoint>'
def parse_codepoint_range(range_str):
    """Convert a hexadecimal codepoint or codepoint range to `[start, end]`.

    The result is half-open: a single codepoint `m` gives [m, m+1) and an
    inclusive range `m..n` gives [m, n+1).
    """
    bounds = range_str.split("..")
    assert len(bounds) in (1, 2), "Invalid format"
    first = int(bounds[0], 16)
    # For a single codepoint the last element is the first one as well.
    last = int(bounds[-1], 16)
    return [first, last + 1]
def read_unicode_data_txt(filepath):
    """Read UnicodeData.txt from `filepath` and fill the module-level tables.

    For every record this collects:
      * `numeric_codepoints`: codepoints whose general category is Nd, Nl
        or No,
      * `ccc_table`: non-zero Canonical_Combining_Class values,
      * `decomposition_map`: canonical decomposition mappings.
    """

    def process_line(line):
        fields = line.split(";")
        # Only lines with exactly 15 ';'-separated fields are processed.
        if len(fields) != 15:
            return
        # Parse codepoint
        cp = int(fields[0], 16)
        # Parse general category
        if fields[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)
        # Parse CCC
        ccc = int(fields[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc
        # Parse decomposition mapping
        # Ignore compatibility decomposition mapping (tagged with "<...>")
        # because it is not required for **NFC** normalization.
        if not fields[5].startswith("<"):
            decomp_cps = [int(s, 16) for s in fields[5].split(" ") if s != ""]
            assert (
                len(decomp_cps) <= 2
            ), "Decomposition_Mapping must not contain more than 2 characters."
            if len(decomp_cps) > 0:
                decomposition_map[cp] = decomp_cps

    # Open the path given by the caller; the previous code ignored the
    # parameter and always read sys.argv[1].
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
def read_derived_norm_props_txt(filepath):
    """Read DerivedNormalizationProps.txt from `filepath`.

    Appends ranges to `composition_exclusion_ranges`, `nfc_qc_no_ranges` and
    `nfc_qc_maybe_ranges`. Raises RuntimeError when an NFC_QC record carries
    a value other than N or M.
    """

    def process_line(line):
        # Strip the trailing '#' comment, then split into trimmed fields.
        fields = [field.strip() for field in line.split("#")[0].split(";")]
        # Blank and comment-only lines yield fewer than two fields.
        if len(fields) < 2:
            return
        cp_range = parse_codepoint_range(fields[0])
        prop = fields[1]
        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        value = fields[2]
        if value == "N":
            nfc_qc_no_ranges.append(cp_range)
        elif value == "M":
            nfc_qc_maybe_ranges.append(cp_range)
        else:
            raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
def read_derived_core_props_txt(filepath):
    """Read DerivedCoreProperties.txt from `filepath`.

    Appends the range of every `Alphabetic` record to `alphabetic_ranges`;
    all other properties are ignored.
    """

    def process_line(line):
        # Strip the trailing '#' comment before splitting into fields.
        fields = line.split("#")[0].split(";")
        # Blank and comment-only lines yield fewer than two fields.
        if len(fields) < 2:
            return
        codepoints = fields[0].strip()
        prop = fields[1].strip()
        if prop == "Alphabetic":
            alphabetic_ranges.append(parse_codepoint_range(codepoints))

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
def write_decomposition():
    """Print the C++ definition of DECOMPOSITION_MAP to stdout.

    One entry is written per key of `decomposition_map`, in ascending
    codepoint order.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        # Every mapped codepoint is followed by ", ", including the last one.
        mapped = "".join("{:#06x}, ".format(d) for d in decomposition_map[cp])
        print(" {" + "{:#06x}".format(cp) + ", {" + mapped + "}},")
    print(" // clang-format on")
    print("};")
def write_recomposition():
    """Print the C++ definition of RECOMPOSITION_MAP to stdout.

    The map is the inverse of `decomposition_map`: (first, second) -> composed
    codepoint. Codepoints found in `composition_exclusion_ranges` are left
    out. A one-element decomposition is written with 0 as the second member.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for cp, parts in decomposition_map.items():
        if binary_search_ranges(composition_exclusion_ranges, cp) == -1:
            first = parts[0]
            second = 0 if len(parts) == 1 else parts[1]
            pair = "{:#06x}, {:#06x}".format(first, second)
            print(" {{" + pair + "}, " + "{:#06x}".format(cp) + "},")
    print(" // clang-format on")
    print("}};")
def write_ccc():
    """Print the C++ definition of CCC_TABLE to stdout.

    One `{codepoint, class}` entry is written per key of `ccc_table`, in the
    table's iteration order.
    """
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for cp, combining_class in ccc_table.items():
        print(" {" + "{:#06x}, {}".format(cp, combining_class) + "},")
    print(" // clang-format on")
    print("};")
def write_alphabetic():
    """Print the C++ definition of ALPHABETIC_RANGES to stdout.

    One `{start, end}` pair is written per element of `alphabetic_ranges`, in
    list order.
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for bounds in alphabetic_ranges:
        lower, upper = bounds[0], bounds[1]
        print(" {" + "{:#06x}, {:#06x}".format(lower, upper) + "},")
    print(" // clang-format on")
    print("}};")
def write_numeric():
    """Print the C++ definition of NUMERIC_CODEPOINTS to stdout.

    The codepoints of `numeric_codepoints` are written in list order, sixteen
    per row. An empty list produces an empty initializer instead of failing
    on an unbound loop variable.
    """
    per_row = 16
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    for row_start in range(0, len(numeric_codepoints), per_row):
        row = numeric_codepoints[row_start : row_start + per_row]
        # Every codepoint is followed by ", ", including the last of a row.
        print(" " + "".join("{:#06x}, ".format(cp) for cp in row))
    print(" // clang-format on")
    print("}};")
def main():
    """Read the three Unicode data files and print rust-unicode-data.h.

    Expects exactly three command-line arguments, in this order:
    UnicodeData.txt, DerivedNormalizationProps.txt, DerivedCoreProperties.txt.
    The generated header is written to stdout. With any other argument count
    a usage message is printed to stderr and the script exits with the same
    status as before (-1).
    """
    if len(sys.argv) != 4:
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt "
            "DerivedCoreProperties.txt".format(sys.argv[0]),
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` helper is provided by
        # the `site` module and may be missing.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]
    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)
    print(COPYRIGHT)
    print()
    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    # The array sizes must be declared before the arrays that use them.
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()
    write_decomposition()
    print()
    write_recomposition()
    print()
    # write_composition_exclusion()
    # print()
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()
    # TODO: write NFC_QC table
    print("} // namespace Rust")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,328 @@
#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
#include "rust-unicode-data.h"
namespace Rust {
// A single Unicode code point, stored as a 32-bit unsigned value.
typedef uint32_t codepoint_t;
// A string represented as a sequence of code points.
typedef std::vector<codepoint_t> string_t;
// Find the range that contains TARGET_CP.
//
// RANGES must be sorted in ascending order and must not overlap.  Each
// element {first, second} stands for the half-open interval [first, second).
// Returns the index of the matching range, or -1 when no range contains
// TARGET_CP (which is always the case for an empty array).
template <std::size_t SIZE>
int64_t
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // The candidate indices form the half-open window [low, high).  Shrinking
  // the window until it is empty guarantees that every element, including
  // the last one, is examined when necessary.
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      if (target_cp < ranges[mid].first)
        high = mid;
      else if (ranges[mid].second <= target_cp)
        low = mid + 1;
      else
        return static_cast<int64_t> (mid);
    }
  return -1;
}
// Look TARGET up in ARRAY, which must be sorted in ascending order.
// Returns the index of an element equal to TARGET, or -1 if there is none.
template <std::size_t SIZE>
int64_t
binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
                            uint32_t target)
{
  if (SIZE == 0)
    return -1;
  // Narrow [lower, upper) down to a single candidate: the last position
  // whose element does not exceed TARGET (or position 0 if all of them do).
  uint32_t lower = 0;
  uint32_t upper = SIZE;
  while (upper - lower > 1)
    {
      const uint32_t probe = (lower + upper) / 2;
      if (target < array[probe])
        upper = probe;
      else
        lower = probe;
    }
  if (array[lower] != target)
    return -1;
  return lower;
}
// Return the Canonical_Combining_Class of C.  Code points that are absent
// from CCC_TABLE are starters, whose class is zero.
int
lookup_cc (codepoint_t c)
{
  const auto entry = Rust::CCC_TABLE.find (c);
  if (entry == Rust::CCC_TABLE.end ())
    return 0;
  return entry->second;
}
// Look up the code point that the pair (STARTER, C) composes to.  The exact
// pair is tried first and the pair (STARTER, 0) second; tl::nullopt is
// returned when RECOMPOSITION_MAP holds neither.
// NOTE(review): the (STARTER, 0) key is the form the generator script uses
// for one-element decompositions -- confirm that such entries are meant to
// match regardless of C.
tl::optional<codepoint_t>
lookup_recomp (codepoint_t starter, codepoint_t c)
{
  const auto &table = Rust::RECOMPOSITION_MAP;

  auto hit = table.find ({starter, c});
  if (hit == table.end ())
    hit = table.find ({starter, 0});

  if (hit == table.end ())
    return tl::nullopt;
  return {hit->second};
}
// Append the full canonical decomposition of C to BUF.  A code point without
// an entry in DECOMPOSITION_MAP is appended unchanged; otherwise every code
// point of its mapping is decomposed in turn.
void
recursive_decomp_cano (codepoint_t c, string_t &buf)
{
  const auto entry = Rust::DECOMPOSITION_MAP.find (c);
  if (entry == Rust::DECOMPOSITION_MAP.end ())
    {
      buf.push_back (c);
      return;
    }
  for (const codepoint_t part : entry->second)
    recursive_decomp_cano (part, buf);
}
// Return S with every code point replaced by its full canonical
// decomposition.  The result is not yet in canonical order.
string_t
decomp_cano (string_t s)
{
  // TODO: Algorithmic lookup for Hangul
  string_t decomposed;
  for (std::size_t pos = 0; pos < s.size (); pos++)
    recursive_decomp_cano (s[pos], decomposed);
  return decomposed;
}
// Put S into canonical order in place.
//
// Two adjacent code points are exchanged only when both have a non-zero
// combining class and the first one's class is greater than the second's.
// A starter (class 0) is therefore never moved and never crossed, which the
// previous `cc_here >= 0` test failed to guarantee.
void
sort_cano (string_t &s)
{
  for (std::size_t i = 1; i < s.size (); i++)
    {
      const int cc_prev = lookup_cc (s[i - 1]);
      const int cc_here = lookup_cc (s[i]);
      // cc_prev > cc_here > 0 implies that neither code point is a starter.
      if (cc_here > 0 && cc_prev > cc_here)
        {
          const codepoint_t tmp = s[i];
          s[i] = s[i - 1];
          s[i - 1] = tmp;
          // Step back so that the moved code point is compared with its new
          // predecessor on the next iteration.
          if (i > 1)
            i -= 2;
        }
    }
}
// Canonically compose S, which must already be fully decomposed and in
// canonical order, and return the result.
//
// The first code point is always kept.  Each following code point is
// combined into the most recent starter of the output when lookup_recomp
// finds a composite for the pair and the code point is not blocked;
// otherwise it is appended unchanged.  A successful composition replaces
// the starter in place, so one starter can absorb several marks.
string_t
recomp (string_t s)
{
  // TODO: Algorithmic lookup for Hangul
  string_t buf;
  if (s.empty ())
    return buf;

  buf.push_back (s[0]);
  // Index in BUF of the code point that later ones may combine with.
  std::size_t starter_pos = 0;
  // Combining class of the last code point appended to BUF.
  int last_class = lookup_cc (s[0]);
  // If the string begins with a combining mark, use a class larger than any
  // value in CCC_TABLE so that nothing combines with it.
  if (last_class != 0)
    last_class = 256;

  for (std::size_t src_pos = 1; src_pos < s.size (); src_pos++)
    {
      const codepoint_t ch = s[src_pos];
      const int ch_class = lookup_cc (ch);
      tl::optional<codepoint_t> composite
        = lookup_recomp (buf[starter_pos], ch);

      // CH is blocked when a code point with the same or a higher class was
      // appended after the starter.  last_class == 0 means that CH directly
      // follows the starter.
      if (composite.has_value ()
          && (last_class < ch_class || last_class == 0))
        {
          buf[starter_pos] = composite.value ();
        }
      else
        {
          if (ch_class == 0)
            // CH becomes the starter for the code points that follow.
            starter_pos = buf.size ();
          last_class = ch_class;
          buf.push_back (ch);
        }
    }
  return buf;
}
// TODO: remove
/*
void
dump_string (std::vector<uint32_t> s)
{
std::cout << "dump=";
for (auto c : s)
{
std::cout << std::hex << c << ", ";
}
std::cout << std::endl;
}
*/
// Return the NFC form of S: canonical decomposition, canonical ordering,
// then canonical composition.
string_t
nfc_normalize (string_t s)
{
  // TODO: Quick Check
  string_t decomposed = decomp_cano (s);
  sort_cano (decomposed);
  return recomp (decomposed);
}
// Return true if CODEPOINT lies in one of the ranges of ALPHABETIC_RANGES.
bool
is_alphabetic (uint32_t codepoint)
{
  // A negative index means that no range matched.
  return binary_search_ranges (ALPHABETIC_RANGES, codepoint) >= 0;
}
// Return true if CODEPOINT is an element of NUMERIC_CODEPOINTS.
bool
is_numeric (uint32_t codepoint)
{
  // A negative index means that the code point was not found.
  return binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint) >= 0;
}
} // namespace Rust
// The self-tests are declared in rust-unicode.h only when CHECKING_P is set,
// so their definitions are guarded the same way.
#if CHECKING_P

namespace selftest {

// Assert that the NFC form of ORIGIN equals EXPECTED, comparing the lengths
// first and then every code point.
void
assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected)
{
  std::vector<uint32_t> actual = Rust::nfc_normalize (origin);

  ASSERT_EQ (actual.size (), expected.size ());
  for (unsigned int i = 0; i < actual.size (); i++)
    {
      ASSERT_EQ (actual[i], expected[i]);
    }
}

// Check Rust::nfc_normalize on ASCII input and on a few canonical
// composition cases.
void
rust_utf8_normalize_test ()
{
  // ASCII
  assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
  // ASCII
  assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});

  // testcases retrieved from Part0 of
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  assert_normalize ({0x1e0a}, {0x1e0a});
  assert_normalize ({0x1e0c}, {0x1e0c});
  assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
  assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
  assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

  // TODO: add more testcases in
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}

// Check Rust::is_alphabetic and Rust::is_numeric on ASCII and non-ASCII
// code points.
void
rust_utf8_property_test ()
{
  ASSERT_TRUE (Rust::is_alphabetic ('A'));
  ASSERT_TRUE (Rust::is_alphabetic ('B'));
  ASSERT_TRUE (Rust::is_alphabetic ('x'));
  ASSERT_TRUE (Rust::is_alphabetic ('z'));
  ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
  ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
  ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
  ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

  ASSERT_FALSE (Rust::is_alphabetic ('\v'));
  ASSERT_FALSE (Rust::is_alphabetic ('-'));
  ASSERT_FALSE (Rust::is_alphabetic ('_'));
  ASSERT_FALSE (Rust::is_alphabetic ('+'));
  ASSERT_FALSE (Rust::is_alphabetic ('0'));
  ASSERT_FALSE (Rust::is_alphabetic ('1'));
  ASSERT_FALSE (Rust::is_alphabetic ('2'));
  ASSERT_FALSE (Rust::is_alphabetic ('9'));
  ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
  ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

  // `Nd`s
  ASSERT_TRUE (Rust::is_numeric ('0'));
  ASSERT_TRUE (Rust::is_numeric ('1'));
  ASSERT_TRUE (Rust::is_numeric ('7'));
  ASSERT_TRUE (Rust::is_numeric ('9'));
  ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
  ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
  // `Nl`s
  ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
  ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
  ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
  ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
  // `No`s
  ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
  ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

  ASSERT_FALSE (Rust::is_numeric ('\n'));
  ASSERT_FALSE (Rust::is_numeric ('-'));
  ASSERT_FALSE (Rust::is_numeric ('_'));
  ASSERT_FALSE (Rust::is_numeric ('('));
  ASSERT_FALSE (Rust::is_numeric ('z'));
  ASSERT_FALSE (Rust::is_numeric (';'));
  ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
  ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
  ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
  ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
  ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
}

} // namespace selftest

#endif // CHECKING_P

View file

@ -0,0 +1,50 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
// This file is part of GCC.
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
#ifndef RUST_UNICODE_H
#define RUST_UNICODE_H
#include "rust-system.h"
namespace Rust {
// TODO: add function nfc_normalize
// Returns true if CODEPOINT lies in one of the generated ALPHABETIC_RANGES,
// which the generator script fills from the `Alphabetic` records of
// DerivedCoreProperties.txt.
bool
is_alphabetic (uint32_t codepoint);
// Returns true if CODEPOINT is listed in the generated NUMERIC_CODEPOINTS,
// which the generator script fills with the code points whose general
// category is Nd, Nl or No.
bool
is_numeric (uint32_t codepoint);
} // namespace Rust
#if CHECKING_P
namespace selftest {
// Self-test for NFC normalization.
void
rust_utf8_normalize_test ();
// Self-test for is_alphabetic and is_numeric.
void
rust_utf8_property_test ();
} // namespace selftest
#endif // CHECKING_P
#endif