gccrs: Add Unicode check for crate_name attributes

gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Add comment.
	* lex/rust-lex.h: New method to get decoded characters.
	* rust-session-manager.cc (validate_crate_name): Modify Unicode check.
	(rust_crate_name_validation_test): Add test cases.
	* util/rust-unicode.h (RUST_UNICODE_H): New class Utf8String.
	(class Utf8String): New class.
	* util/rust-unicode.cc (binary_search_sorted_array): Add comment.
	(recursive_decomp_cano): Add comment.
	(recomp): Remove dead code.
	(dump_string): Removed.

gcc/testsuite/ChangeLog:

	* rust/compile/bad-crate-name.rs: Moved to...
	* rust/compile/bad-crate-name1.rs: ...here.
	* rust/compile/bad-crate-name2.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
2023-07-14 14:45:34 +09:00 · 2023-07-14 14:45:34 +09:00 · f7b2e17682
commit f7b2e17682
parent 884dec3a42
7 changed files with 59 additions and 29 deletions
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/lex/rust-codepoint.h
@ -22,6 +22,8 @@
 #include "rust-system.h"

 namespace Rust {
+
+// FIXME: move this to rust-unicode.h?
 struct Codepoint
 {
  uint32_t value;
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@ -334,6 +334,14 @@ public:
 	  return c;
 	}
    }
+
+    tl::optional<std::vector<Codepoint>> get_chars ()
+    {
+      if (is_valid ())
+	return {chars};
+      else
+	return tl::nullopt;
+    }
  };

  class FileInputSource : public InputSource
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@ -42,6 +42,7 @@
 #include "rust-early-name-resolver.h"
 #include "rust-cfg-strip.h"
 #include "rust-expand-visitor.h"
+#include "rust-unicode.h"

 #include "diagnostic.h"
 #include "input.h"
@ -107,30 +108,39 @@ infer_crate_name (const std::string &filename)
  return crate;
 }

-/* Validate the crate name using the ASCII rules
-   TODO: Support Unicode version of the rules */
+/* Validate the crate name using the Unicode rules */

 static bool
 validate_crate_name (const std::string &crate_name, Error &error)
 {
-  if (crate_name.empty ())
+  Utf8String utf8_name = {crate_name};
+  tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
+
+  if (!uchars_opt.has_value ())
+    {
+      error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
+      return false;
+    }
+
+  std::vector<Codepoint> uchars = uchars_opt.value ();
+  if (uchars.empty ())
    {
      error = Error (UNDEF_LOCATION, "crate name cannot be empty");
      return false;
    }
-  if (crate_name.length () > kMaxNameLength)
+  if (uchars.size () > kMaxNameLength)
    {
      error = Error (UNDEF_LOCATION, "crate name cannot exceed %lu characters",
 		     (unsigned long) kMaxNameLength);
      return false;
    }
-  for (auto &c : crate_name)
+  for (Codepoint &c : uchars)
    {
-      if (!(ISALNUM (c) || c == '_'))
+      if (!(is_alphabetic (c.value) || is_numeric (c.value) || c.value == '_'))
 	{
 	  error = Error (UNDEF_LOCATION,
-			 "invalid character %<%c%> in crate name: %<%s%>", c,
-			 crate_name.c_str ());
+			 "invalid character %<%s%> in crate name: %<%s%>",
+			 c.as_string ().c_str (), crate_name.c_str ());
 	  return false;
 	}
    }
@ -1273,13 +1283,17 @@ rust_crate_name_validation_test (void)
  ASSERT_TRUE (Rust::validate_crate_name ("example", error));
  ASSERT_TRUE (Rust::validate_crate_name ("abcdefg_1234", error));
  ASSERT_TRUE (Rust::validate_crate_name ("1", error));
-  // FIXME: The next test does not pass as of current implementation
-  // ASSERT_TRUE (Rust::CompileOptions::validate_crate_name ("惊吓"));
+  ASSERT_TRUE (Rust::validate_crate_name ("クレート", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("Sōkrátēs", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("惊吓", error));
+
  // NOTE: - is not allowed in the crate name ...

  ASSERT_FALSE (Rust::validate_crate_name ("abcdefg-1234", error));
  ASSERT_FALSE (Rust::validate_crate_name ("a+b", error));
  ASSERT_FALSE (Rust::validate_crate_name ("/a+b/", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("😸++", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("∀", error));

  /* Tests for crate name inference */
  ASSERT_EQ (Rust::infer_crate_name ("c.rs"), "c");
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@ -12,6 +12,7 @@ typedef std::vector<codepoint_t> string_t;
 template <std::size_t SIZE>
 int64_t
 binary_search_ranges (
+  // FIXME: use binary search function from <algorithm>
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
 {
@ -49,6 +50,7 @@ int64_t
 binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
 			    uint32_t target)
 {
+  // FIXME: use binary search function from <algorithm>
  if (SIZE == 0)
    return -1;

@ -104,9 +106,7 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
    {
      string_t decomped = it->second;
      for (codepoint_t cp : decomped)
-	{
-	  recursive_decomp_cano (cp, buf);
-	}
+	recursive_decomp_cano (cp, buf);
    }
  else
    buf.push_back (c);
@ -152,8 +152,7 @@ recomp (string_t s)
  if (s.size () > 0)
    {
      int last_class = -1;
-      // int starter_pos = 0; // Assume the first character is Starter. Correct?
-      // int target_pos = 1;
+      // Assume the first character is Starter.
      codepoint_t starter_ch = s[0];
      for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
 	{
@ -189,20 +188,6 @@ recomp (string_t s)
  return buf;
 }

-// TODO: remove
-/*
-void
-dump_string (std::vector<uint32_t> s)
-{
-  std::cout << "dump=";
-  for (auto c : s)
-    {
-      std::cout << std::hex << c << ", ";
-    }
-  std::cout << std::endl;
-}
-*/
-
 string_t
 nfc_normalize (string_t s)
 {
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@ -19,10 +19,29 @@
 #ifndef RUST_UNICODE_H
 #define RUST_UNICODE_H

+#include "optional.h"
 #include "rust-system.h"
+#include "rust-lex.h"

 namespace Rust {

+class Utf8String
+{
+private:
+  tl::optional<std::vector<Codepoint>> chars;
+
+public:
+  Utf8String (const std::string &maybe_utf8)
+  {
+    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
+    chars = input_source.get_chars ();
+  }
+
+  // Returns the decoded Unicode codepoints when the string is valid UTF-8,
+  // and nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
+};
+
 // TODO: add function nfc_normalize

 bool
--- a/gcc/testsuite/rust/compile/bad-crate-name1.rs
+++ b/gcc/testsuite/rust/compile/bad-crate-name1.rs
--- a/gcc/testsuite/rust/compile/bad-crate-name2.rs
+++ b/gcc/testsuite/rust/compile/bad-crate-name2.rs
@ -0,0 +1,2 @@
+#![crate_name = "😅"] // { dg-error "invalid character ...." "" }
+fn main() {}