gccrs: Add function for Unicode NFC normalization
gcc/rust/ChangeLog: * Make-lang.in: Add rust-unicode.o * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Include <array> * util/make-rust-unicode.py: Generator of rust-unicode-data.h. * util/rust-unicode-data.h: Auto-generated file. * util/rust-unicode.cc: New file. * util/rust-unicode.h: New file. Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
This commit is contained in:
parent
fa13cfd08a
commit
4d63098451
7 changed files with 5879 additions and 0 deletions
|
@ -181,6 +181,7 @@ GRS_OBJS = \
|
|||
rust/rust-feature.o \
|
||||
rust/rust-feature-gate.o \
|
||||
rust/rust-dir-owner.o \
|
||||
rust/rust-unicode.o \
|
||||
$(END)
|
||||
# removed object files from here
|
||||
|
||||
|
|
|
@ -37,6 +37,7 @@
|
|||
#include "rust-ast-resolve-item.h"
|
||||
#include "rust-lex.h"
|
||||
#include "optional.h"
|
||||
#include "rust-unicode.h"
|
||||
|
||||
#include <mpfr.h>
|
||||
// note: header files must be in this order or else forward declarations don't
|
||||
|
@ -455,6 +456,7 @@ run_rust_tests ()
|
|||
rust_privacy_ctx_test ();
|
||||
rust_crate_name_validation_test ();
|
||||
rust_simple_path_resolve_test ();
|
||||
rust_utf8_normalize_test ();
|
||||
}
|
||||
} // namespace selftest
|
||||
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include <memory>
|
||||
#include <utility>
|
||||
#include <fstream>
|
||||
#include <array>
|
||||
|
||||
// Rust frontend requires C++11 minimum, so will have unordered_map and set
|
||||
#include <unordered_map>
|
||||
|
|
289
gcc/rust/util/make-rust-unicode.py
Normal file
289
gcc/rust/util/make-rust-unicode.py
Normal file
|
@ -0,0 +1,289 @@
|
|||
# Copyright (C) 2020-2023 Free Software Foundation, Inc.
|
||||
|
||||
# This file is part of GCC.
|
||||
|
||||
# GCC is free software; you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free
|
||||
# Software Foundation; either version 3, or (at your option) any later
|
||||
# version.
|
||||
|
||||
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with GCC; see the file COPYING3. If not see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Run this program as
|
||||
# python ./make-rust-unicode.py UnicodeData.txt \
|
||||
# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
|
||||
# > rust-unicode-data.h
|
||||
|
||||
import sys
|
||||
|
||||
# License banner printed verbatim at the top of the generated C++ header.
# Kept as one entry per output line; the final line has no trailing newline.
COPYRIGHT = "\n".join(
    (
        "// Copyright (C) 2020-2023 Free Software Foundation, Inc.",
        "",
        "// This file is part of GCC.",
        "",
        "// GCC is free software; you can redistribute it and/or modify it under",
        "// the terms of the GNU General Public License as published by the Free",
        "// Software Foundation; either version 3, or (at your option) any later",
        "// version.",
        "",
        "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY",
        "// WARRANTY; without even the implied warranty of MERCHANTABILITY or",
        "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License",
        "// for more details.",
        "",
        "// You should have received a copy of the GNU General Public License",
        "// along with GCC; see the file COPYING3. If not see",
        "// <http://www.gnu.org/licenses/>.",
    )
)
|
||||
|
||||
# Decomposition_Mapping table: codepoint -> list of one or two codepoints.
# Only canonical mappings are stored (see read_unicode_data_txt).
decomposition_map = {}
# Canonical_Combining_Class table: codepoint -> CCC; zero values are omitted.
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges = []
# Codepoints whose General_Category is Nd, Nl or No
numeric_codepoints = []

# Note: every range above is a two-element list `[m, n]` that denotes the
# half-open interval [m, n); `n` itself is NOT part of the range.
|
||||
|
||||
|
||||
def binary_search_ranges(ranges, target):
    """Return the index of the range that contains *target*, or -1.

    *ranges* is a list of `[start, end]` pairs, each denoting the half-open
    interval [start, end).  The search bisects on position, so the list is
    expected to be sorted and non-overlapping.
    """
    first, last = 0, len(ranges) - 1
    while first <= last:
        probe = (first + last) // 2
        lower, upper = ranges[probe]
        if target < lower:
            # Target lies before this range: continue in the left half.
            last = probe - 1
            continue
        if target > upper - 1:
            # Target lies at or past the exclusive end: go right.
            first = probe + 1
            continue
        return probe
    # Search window is empty: no range contains the target.
    return -1
|
||||
|
||||
|
||||
# Utility function to parse '<codepoint>..<codepoint>' or '<codepoint>'
def parse_codepoint_range(range_str):
    """Parse a hexadecimal codepoint or codepoint range into `[start, end]`.

    The result denotes the half-open interval [start, end):
      * 'm'    => [m, m+1)
      * 'm..n' => [m, n+1)

    Raises ValueError when *range_str* has more than one '..' separator or
    when a bound is not a valid hexadecimal number.
    """
    bounds = range_str.split("..")
    # Validate explicitly: an `assert` would vanish under `python -O`.
    if len(bounds) not in (1, 2):
        raise ValueError("Invalid format: {!r}".format(range_str))
    start_cp = int(bounds[0], 16)
    if len(bounds) == 1:
        # m => [m, m+1)
        end_cp = start_cp + 1
    else:
        # m..n => [m, n+1)
        end_cp = int(bounds[1], 16) + 1
    return [start_cp, end_cp]
|
||||
|
||||
|
||||
def read_unicode_data_txt(filepath):
    """Read a UnicodeData.txt-style file and fill the module-level tables.

    For every line with exactly 15 ';'-separated fields this records:
      * the codepoint in `numeric_codepoints` if its category (field 2) is
        Nd, Nl or No;
      * its combining class (field 3) in `ccc_table` when non-zero;
      * its decomposition (field 5) in `decomposition_map` when the field
        does not start with '<' and is not empty.
    Lines with any other field count are skipped.
    """

    def process_line(line):
        rows = line.split(";")
        if len(rows) != 15:
            return
        # Parse codepoint
        cp = int(rows[0], 16)
        # Parse general category
        if rows[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Parse CCC
        ccc = int(rows[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc
        # Parse decomposition mapping
        # Ignore compatibility decomposition mapping (tagged with '<...>')
        # because it is not required for **NFC** normalization.
        if not rows[5].startswith("<"):
            decomp_cps = [int(s, 16) for s in rows[5].split(" ") if s != ""]
            assert (
                len(decomp_cps) <= 2
            ), "Decomposition_Mapping must not contain more than 2 characters."
            if decomp_cps:
                decomposition_map[cp] = decomp_cps

    # Open the path we were given; the original read sys.argv[1] here and
    # silently ignored the `filepath` argument.
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
|
||||
|
||||
|
||||
def read_derived_norm_props_txt(filepath):
    """Read a DerivedNormalizationProps.txt-style file.

    Collects Full_Composition_Exclusion ranges into
    `composition_exclusion_ranges` and NFC_QC ranges into `nfc_qc_no_ranges`
    (value N) or `nfc_qc_maybe_ranges` (value M).  Raises RuntimeError for
    any other NFC_QC value.
    """

    def process_line(line):
        # Drop the trailing '#' comment, then split into ';'-separated fields.
        fields = line.split("#")[0].split(";")
        # Too few fields. Skipped.
        if len(fields) < 2:
            return
        fields[0] = fields[0].strip()
        fields[1] = fields[1].strip()
        # The range is parsed for every data line, whatever its property.
        cp_range = parse_codepoint_range(fields[0])
        prop = fields[1]
        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        fields[2] = fields[2].strip()
        value = fields[2]
        if value == "N":
            nfc_qc_no_ranges.append(cp_range)
        elif value == "M":
            nfc_qc_maybe_ranges.append(cp_range)
        else:
            raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
|
||||
|
||||
|
||||
def read_derived_core_props_txt(filepath):
    """Read a DerivedCoreProperties.txt-style file.

    Appends the codepoint range of every `Alphabetic` entry to
    `alphabetic_ranges`; all other properties are ignored.
    """

    def process_line(line):
        # Drop the trailing '#' comment, then split into ';'-separated fields.
        fields = line.split("#")[0].split(";")
        # Too few fields. Skipped.
        if len(fields) < 2:
            return
        codepoints = fields[0].strip()
        prop = fields[1].strip()
        if prop == "Alphabetic":
            alphabetic_ranges.append(parse_codepoint_range(codepoints))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
|
||||
|
||||
|
||||
def write_decomposition():
    """Print DECOMPOSITION_MAP as a C++ std::map initializer.

    Entries are emitted in ascending codepoint order, one per line, each
    mapping a codepoint to the list stored in `decomposition_map`.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        # Every member keeps its trailing ", " (valid inside a C++ brace list).
        members = "".join("{:#06x}, ".format(d) for d in decomposition_map[cp])
        print(" {{{:#06x}, ".format(cp) + "{" + members + "}},")
    print(" // clang-format on")
    print("};")
|
||||
|
||||
|
||||
def write_recomposition():
    """Print RECOMPOSITION_MAP, the inverse of the decomposition table.

    Each entry maps a `(first, second)` codepoint pair back to the codepoint
    that decomposes into it.  Codepoints found in
    `composition_exclusion_ranges` are left out; a one-element decomposition
    uses 0 as its second member.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for cp, mapping in decomposition_map.items():
        if binary_search_ranges(composition_exclusion_ranges, cp) != -1:
            continue
        d1 = mapping[0]
        d2 = 0 if len(mapping) == 1 else mapping[1]
        print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(d1, d2, cp))
    print(" // clang-format on")
    print("}};")
|
||||
|
||||
|
||||
def write_ccc():
    """Print CCC_TABLE: codepoint -> Canonical_Combining_Class, one per line.

    Entries follow the insertion order of `ccc_table`.
    """
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for cp, ccc in ccc_table.items():
        print(" {{{:#06x}, {}}},".format(cp, ccc))
    print(" // clang-format on")
    print("};")
|
||||
|
||||
|
||||
def write_alphabetic():
    """Print ALPHABETIC_RANGES as a C++ std::array of `{start, end}` pairs.

    The pairs are the half-open ranges collected in `alphabetic_ranges`.
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for start_cp, end_cp in alphabetic_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(start_cp, end_cp))
    print(" // clang-format on")
    print("}};")
|
||||
|
||||
|
||||
def write_numeric():
    """Print NUMERIC_CODEPOINTS as a C++ std::array, 16 codepoints per row.

    Works for an empty `numeric_codepoints` list too: the original consulted
    the loop index after the loop and raised NameError when the loop body
    never ran.
    """
    per_row = 16
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    for row_start in range(0, len(numeric_codepoints), per_row):
        row = numeric_codepoints[row_start : row_start + per_row]
        # One leading space, then each value followed by ", ".
        print(" " + "".join("{:#06x}, ".format(cp) for cp in row))
    print(" // clang-format on")
    print("}};")
|
||||
|
||||
|
||||
def main():
    """Generate the Unicode data header on standard output.

    Expects exactly three command-line arguments, in this order:
    UnicodeData.txt, DerivedNormalizationProps.txt and
    DerivedCoreProperties.txt.  With any other argument count a usage
    message is written to stderr and the process exits with status -1.
    """
    if len(sys.argv) != 4:
        # The count can be wrong in either direction, so print the usage
        # instead of claiming "too few arguments".
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt"
            " DerivedCoreProperties.txt".format(sys.argv[0]),
            file=sys.stderr,
        )
        # sys.exit, not the interactive-only `exit` helper from `site`.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()

    # Each table is followed by one blank line.
    for write_table in (
        write_decomposition,
        write_recomposition,
        write_ccc,
        write_alphabetic,
        write_numeric,
    ):
        write_table()
        print()

    # TODO: write NFC_QC table

    print("} // namespace Rust")


if __name__ == "__main__":
    main()
|
5208
gcc/rust/util/rust-unicode-data.h
Normal file
5208
gcc/rust/util/rust-unicode-data.h
Normal file
File diff suppressed because it is too large
Load diff
328
gcc/rust/util/rust-unicode.cc
Normal file
328
gcc/rust/util/rust-unicode.cc
Normal file
|
@ -0,0 +1,328 @@
|
|||
#include "rust-system.h"
|
||||
#include "optional.h"
|
||||
#include "selftest.h"
|
||||
|
||||
#include "rust-unicode-data.h"
|
||||
|
||||
namespace Rust {
|
||||
|
||||
typedef uint32_t codepoint_t;
|
||||
typedef std::vector<codepoint_t> string_t;
|
||||
|
||||
// Find the range in RANGES that contains TARGET_CP.
//
// Each element is a half-open interval [first, second).  The array is
// bisected by position, so it must be sorted and non-overlapping.
// Returns the index of the matching range, or -1 if there is none
// (including when SIZE is 0).
template <std::size_t SIZE>
int64_t
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // Candidate indices are [low, high).  Narrowing until the window is empty
  // guarantees that every element is examined when needed; the previous
  // version stopped with two candidates left and tested only the lower one.
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      if (target_cp < ranges[mid].first)
        high = mid;
      else if (target_cp >= ranges[mid].second)
        low = mid + 1;
      else
        return static_cast<int64_t> (mid);
    }
  return -1;
}
|
||||
|
||||
// Look TARGET up in ARRAY, which must be sorted in ascending order.
// Returns the index of an element equal to TARGET, or -1 if none is found
// (including when SIZE is 0).
template <std::size_t SIZE>
int64_t
binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
                            uint32_t target)
{
  if (SIZE == 0)
    return -1;

  // Shrink the window [lo, hi) to a single candidate.  `lo` only advances
  // onto elements that are <= target.
  uint32_t lo = 0;
  for (uint32_t hi = SIZE; hi - lo > 1;)
    {
      const uint32_t probe = (lo + hi) / 2;
      if (target < array[probe])
        hi = probe;
      else
        lo = probe;
    }

  return array[lo] == target ? static_cast<int64_t> (lo) : -1;
}
|
||||
|
||||
int
|
||||
lookup_cc (codepoint_t c)
|
||||
{
|
||||
auto it = Rust::CCC_TABLE.find (c);
|
||||
if (it != Rust::CCC_TABLE.end ())
|
||||
return it->second;
|
||||
else
|
||||
// Starter. Returns zero.
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Look up the codepoint that the pair (STARTER, C) composes to.
// The exact pair is tried first, then the key (STARTER, 0).
// Returns tl::nullopt when neither key is in RECOMPOSITION_MAP.
tl::optional<codepoint_t>
lookup_recomp (codepoint_t starter, codepoint_t c)
{
  // NOTE(review): a hit on (STARTER, 0) is returned as the composite of
  // STARTER and C even though C took no part in the match -- confirm this
  // fallback is intended.
  const codepoint_t second_keys[] = {c, 0};
  for (codepoint_t second : second_keys)
    {
      auto hit = Rust::RECOMPOSITION_MAP.find ({starter, second});
      if (hit != Rust::RECOMPOSITION_MAP.end ())
        return {hit->second};
    }

  return tl::nullopt;
}
|
||||
|
||||
void
|
||||
recursive_decomp_cano (codepoint_t c, string_t &buf)
|
||||
{
|
||||
auto it = Rust::DECOMPOSITION_MAP.find (c);
|
||||
if (it != Rust::DECOMPOSITION_MAP.end ())
|
||||
{
|
||||
string_t decomped = it->second;
|
||||
for (codepoint_t cp : decomped)
|
||||
{
|
||||
recursive_decomp_cano (cp, buf);
|
||||
}
|
||||
}
|
||||
else
|
||||
buf.push_back (c);
|
||||
}
|
||||
|
||||
// Return S with every codepoint replaced by its recursive decomposition,
// in the original order.
string_t
decomp_cano (string_t s)
{
  // TODO: Algorithmic lookup for Hangul
  string_t decomposed;
  for (auto it = s.begin (); it != s.end (); ++it)
    recursive_decomp_cano (*it, decomposed);
  return decomposed;
}
|
||||
|
||||
void
|
||||
sort_cano (string_t &s)
|
||||
{
|
||||
int cc_here, cc_prev;
|
||||
if (s.size () == 1)
|
||||
return;
|
||||
for (unsigned int i = 1; i < s.size (); i++)
|
||||
{
|
||||
cc_here = lookup_cc (s[i]);
|
||||
cc_prev = lookup_cc (s[i - 1]);
|
||||
if (cc_here >= 0 && cc_prev > cc_here)
|
||||
{
|
||||
// swap
|
||||
int tmp = s[i];
|
||||
s[i] = s[i - 1];
|
||||
s[i - 1] = tmp;
|
||||
if (i > 1)
|
||||
i -= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Canonically compose S, which must already be decomposed and canonically
// ordered.
//
// Every codepoint is appended to the result unless it combines with the most
// recent starter, in which case the starter is replaced in place by the
// composite.  A codepoint may combine only when it is not blocked: either it
// directly follows the starter, or the last codepoint kept after the starter
// has a strictly smaller combining class.
string_t
recomp (string_t s)
{
  // TODO: Algorithmic lookup for Hangul
  string_t buf;
  if (s.empty ())
    return buf;

  buf.reserve (s.size ());
  buf.push_back (s[0]);

  // Position in BUF and current value of the starter being built up.
  std::size_t starter_pos = 0;
  codepoint_t starter_ch = s[0];

  // Combining class of the last codepoint appended to BUF.
  int last_class = lookup_cc (starter_ch);
  if (last_class != 0)
    // S begins with a non-starter: nothing may combine with it.  256 is
    // larger than any combining class, so it blocks every candidate.
    last_class = 256;

  for (std::size_t src_pos = 1; src_pos < s.size (); src_pos++)
    {
      const codepoint_t ch = s[src_pos];
      const int ch_class = lookup_cc (ch);

      tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
      if (composite.has_value ()
          && (last_class < ch_class || last_class == 0))
        {
          // ch combines with the starter: overwrite the starter.
          buf[starter_pos] = composite.value ();
          starter_ch = composite.value ();
        }
      else
        {
          if (ch_class == 0)
            {
              // ch is a starter that cannot be composed; later marks
              // attach to it.
              starter_pos = buf.size ();
              starter_ch = ch;
            }
          last_class = ch_class;
          buf.push_back (ch);
        }
    }
  return buf;
}
|
||||
|
||||
// TODO: remove
|
||||
/*
|
||||
void
|
||||
dump_string (std::vector<uint32_t> s)
|
||||
{
|
||||
std::cout << "dump=";
|
||||
for (auto c : s)
|
||||
{
|
||||
std::cout << std::hex << c << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
*/
|
||||
|
||||
// Normalize S: decompose, reorder the combining marks, then recompose.
string_t
nfc_normalize (string_t s)
{
  // TODO: Quick Check

  string_t ordered = decomp_cano (s);
  sort_cano (ordered);
  return recomp (ordered);
}
|
||||
|
||||
bool
|
||||
is_alphabetic (uint32_t codepoint)
|
||||
{
|
||||
int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint);
|
||||
if (res < 0)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
is_numeric (uint32_t codepoint)
|
||||
{
|
||||
int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
|
||||
if (res < 0)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace Rust
|
||||
|
||||
// The self-tests are declared in rust-unicode.h only under CHECKING_P, so
// their definitions are guarded the same way.
#if CHECKING_P

namespace selftest {

// Normalize ORIGIN with Rust::nfc_normalize and assert that the result
// equals EXPECTED, first by length and then element by element.
void
assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected)
{
  std::vector<uint32_t> actual = Rust::nfc_normalize (origin);

  ASSERT_EQ (actual.size (), expected.size ());
  for (unsigned int i = 0; i < actual.size (); i++)
    {
      ASSERT_EQ (actual[i], expected[i]);
    }
}

// Checks for Rust::nfc_normalize.
void
rust_utf8_normalize_test ()
{
  // ASCII letters
  assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
  // ASCII punctuation
  assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});

  // testcases retrieved from Part0 of
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  assert_normalize ({0x1e0a}, {0x1e0a});
  assert_normalize ({0x1e0c}, {0x1e0c});
  assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
  assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
  assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

  // TODO: add more testcases in
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}

// Checks for Rust::is_alphabetic and Rust::is_numeric.
// NOTE(review): the visible run_rust_tests hunk registers only
// rust_utf8_normalize_test -- confirm this test is invoked somewhere.
void
rust_utf8_property_test ()
{
  ASSERT_TRUE (Rust::is_alphabetic ('A'));
  ASSERT_TRUE (Rust::is_alphabetic ('B'));
  ASSERT_TRUE (Rust::is_alphabetic ('x'));
  ASSERT_TRUE (Rust::is_alphabetic ('z'));
  ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
  ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
  ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
  ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

  ASSERT_FALSE (Rust::is_alphabetic ('\v'));
  ASSERT_FALSE (Rust::is_alphabetic ('-'));
  ASSERT_FALSE (Rust::is_alphabetic ('_'));
  ASSERT_FALSE (Rust::is_alphabetic ('+'));
  ASSERT_FALSE (Rust::is_alphabetic ('0'));
  ASSERT_FALSE (Rust::is_alphabetic ('1'));
  ASSERT_FALSE (Rust::is_alphabetic ('2'));
  ASSERT_FALSE (Rust::is_alphabetic ('9'));
  ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
  ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

  // `Nd`s
  ASSERT_TRUE (Rust::is_numeric ('0'));
  ASSERT_TRUE (Rust::is_numeric ('1'));
  ASSERT_TRUE (Rust::is_numeric ('7'));
  ASSERT_TRUE (Rust::is_numeric ('9'));
  ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
  ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
  // `Nl`s
  ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
  ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
  ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
  ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
  // `No`s
  ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
  ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

  ASSERT_FALSE (Rust::is_numeric ('\n'));
  ASSERT_FALSE (Rust::is_numeric ('-'));
  ASSERT_FALSE (Rust::is_numeric ('_'));
  ASSERT_FALSE (Rust::is_numeric ('('));
  ASSERT_FALSE (Rust::is_numeric ('z'));
  ASSERT_FALSE (Rust::is_numeric (';'));
  ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
  ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
  ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
  ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
  ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
}

} // namespace selftest

#endif // CHECKING_P
|
50
gcc/rust/util/rust-unicode.h
Normal file
50
gcc/rust/util/rust-unicode.h
Normal file
|
@ -0,0 +1,50 @@
|
|||
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
|
||||
|
||||
// This file is part of GCC.
|
||||
|
||||
// GCC is free software; you can redistribute it and/or modify it under
|
||||
// the terms of the GNU General Public License as published by the Free
|
||||
// Software Foundation; either version 3, or (at your option) any later
|
||||
// version.
|
||||
|
||||
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
// for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with GCC; see the file COPYING3. If not see
|
||||
// <http://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef RUST_UNICODE_H
|
||||
#define RUST_UNICODE_H
|
||||
|
||||
#include "rust-system.h"
|
||||
|
||||
namespace Rust {
|
||||
|
||||
// TODO: add function nfc_normalize
|
||||
|
||||
bool
|
||||
is_alphabetic (uint32_t codepoint);
|
||||
|
||||
bool
|
||||
is_numeric (uint32_t codepoint);
|
||||
|
||||
} // namespace Rust
|
||||
|
||||
#if CHECKING_P
|
||||
|
||||
namespace selftest {
|
||||
|
||||
void
|
||||
rust_utf8_normalize_test ();
|
||||
|
||||
void
|
||||
rust_utf8_property_test ();
|
||||
|
||||
} // namespace selftest
|
||||
|
||||
#endif // CHECKING_P
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue