Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973)

Factor out cp_ident_is_alpha/cp_ident_is_alnum out of gdb/cp-name-parser.y and use it in the C/C++ expression parser too. New test included. gdb/ChangeLog: 2018-05-22 Pedro Alves <palves@redhat.com> 張俊芝 <zjz@zjz.name> PR gdb/22973 * c-exp.y: Include "c-support.h". (parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead of tolower. Use c_ident_is_alpha to scan names. * c-lang.c: Include "c-support.h". (convert_ucn, convert_octal, convert_hex, convert_escape): Use ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit. * c-support.h: New file, with bits factored out from ... * cp-name-parser.y: ... this file. Include "c-support.h". (cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to c-support.h and renamed. (symbol_end, yylex): Adjust. gdb/testsuite/ChangeLog: 2018-05-22 Pedro Alves <palves@redhat.com> PR gdb/22973 * gdb.base/utf8-identifiers.c: New file. * gdb.base/utf8-identifiers.exp: New file.
2018-05-22 17:35:38 +01:00 · 2018-05-22 17:35:38 +01:00 · b1b60145ae
commit b1b60145ae
parent 0ec848ad25
8 changed files with 240 additions and 44 deletions
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@ -1,3 +1,20 @@
+2018-05-22  Pedro Alves  <palves@redhat.com>
+	    張俊芝  <zjz@zjz.name>
+
+	PR gdb/22973
+	* c-exp.y: Include "c-support.h".
+	(parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead
+	of tolower.  Use c_ident_is_alpha to scan names.
+	* c-lang.c: Include "c-support.h".
+	(convert_ucn, convert_octal, convert_hex, convert_escape): Use
+	ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
+	* c-support.h: New file, with bits factored out from ...
+	* cp-name-parser.y: ... this file.
+	Include "c-support.h".
+	(cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
+	c-support.h and renamed.
+	(symbol_end, yylex): Adjust.
+
 2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>

 	* arch/ppc-linux-common.c (ppc_linux_has_isa205): Change the
--- a/gdb/c-exp.y
+++ b/gdb/c-exp.y
@ -42,6 +42,7 @@
 #include "parser-defs.h"
 #include "language.h"
 #include "c-lang.h"
+#include "c-support.h"
 #include "bfd.h" /* Required by objfiles.h.  */
 #include "symfile.h" /* Required by objfiles.h.  */
 #include "objfiles.h" /* For have_full_symbols and have_partial_symbols */
@ -1806,13 +1807,13 @@ parse_number (struct parser_state *par_state,
 	  len -= 2;
 	}
      /* Handle suffixes: 'f' for float, 'l' for long double.  */
-      else if (len >= 1 && tolower (p[len - 1]) == 'f')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'f')
 	{
 	  putithere->typed_val_float.type
 	    = parse_type (par_state)->builtin_float;
 	  len -= 1;
 	}
-      else if (len >= 1 && tolower (p[len - 1]) == 'l')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'l')
 	{
 	  putithere->typed_val_float.type
 	    = parse_type (par_state)->builtin_long_double;
@ -2023,9 +2024,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
      if (output)
 	obstack_grow_str (output, "\\x");
      ++tokptr;
-      if (!isxdigit (*tokptr))
+      if (!ISXDIGIT (*tokptr))
 	error (_("\\x escape without a following hex digit"));
-      while (isxdigit (*tokptr))
+      while (ISXDIGIT (*tokptr))
 	{
 	  if (output)
 	    obstack_1grow (output, *tokptr);
@ -2048,7 +2049,7 @@ c_parse_escape (const char **ptr, struct obstack *output)
 	if (output)
 	  obstack_grow_str (output, "\\");
 	for (i = 0;
-	     i < 3 && isdigit (*tokptr) && *tokptr != '8' && *tokptr != '9';
+	     i < 3 && ISDIGIT (*tokptr) && *tokptr != '8' && *tokptr != '9';
 	     ++i)
 	  {
 	    if (output)
@ -2073,9 +2074,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
 	    obstack_1grow (output, *tokptr);
 	  }
 	++tokptr;
-	if (!isxdigit (*tokptr))
+	if (!ISXDIGIT (*tokptr))
 	  error (_("\\%c escape without a following hex digit"), c);
-	for (i = 0; i < len && isxdigit (*tokptr); ++i)
+	for (i = 0; i < len && ISXDIGIT (*tokptr); ++i)
 	  {
 	    if (output)
 	      obstack_1grow (output, *tokptr);
@ -2668,7 +2669,7 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
 	    size_t len = strlen ("selector");

 	    if (strncmp (p, "selector", len) == 0
-		&& (p[len] == '\0' || isspace (p[len])))
+		&& (p[len] == '\0' || ISSPACE (p[len])))
 	      {
 		lexptr = p + len;
 		return SELECTOR;
@ -2677,9 +2678,9 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
 	      goto parse_string;
 	  }

-	while (isspace (*p))
+	while (ISSPACE (*p))
 	  p++;
-	if (strncmp (p, "entry", len) == 0 && !isalnum (p[len])
+	if (strncmp (p, "entry", len) == 0 && !c_ident_is_alnum (p[len])
 	    && p[len] != '_')
 	  {
 	    lexptr = &p[len];
@ -2741,16 +2742,14 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
      }
    }

-  if (!(c == '_' || c == '$'
-	|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
    /* We must have come across a bad character (e.g. ';').  */
    error (_("Invalid character '%c' in expression."), c);

  /* It's a name.  See how long it is.  */
  namelen = 0;
  for (c = tokstart[namelen];
-       (c == '_' || c == '$' || (c >= '0' && c <= '9')
-	|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '<');)
+       (c == '_' || c == '$' || c_ident_is_alnum (c) || c == '<');)
    {
      /* Template parameter lists are part of the name.
 	 FIXME: This mishandles `print $a<4&&$a>3'.  */
--- a/gdb/c-lang.c
+++ b/gdb/c-lang.c
@ -25,6 +25,7 @@
 #include "language.h"
 #include "varobj.h"
 #include "c-lang.h"
+#include "c-support.h"
 #include "valprint.h"
 #include "macroscope.h"
 #include "charset.h"
@ -382,7 +383,7 @@ convert_ucn (char *p, char *limit, const char *dest_charset,
  gdb_byte data[4];
  int i;

-  for (i = 0; i < length && p < limit && isxdigit (*p); ++i, ++p)
+  for (i = 0; i < length && p < limit && ISXDIGIT (*p); ++i, ++p)
    result = (result << 4) + host_hex_value (*p);

  for (i = 3; i >= 0; --i)
@ -424,7 +425,7 @@ convert_octal (struct type *type, char *p,
  unsigned long value = 0;

  for (i = 0;
-       i < 3 && p < limit && isdigit (*p) && *p != '8' && *p != '9';
+       i < 3 && p < limit && ISDIGIT (*p) && *p != '8' && *p != '9';
       ++i)
    {
      value = 8 * value + host_hex_value (*p);
@ -447,7 +448,7 @@ convert_hex (struct type *type, char *p,
 {
  unsigned long value = 0;

-  while (p < limit && isxdigit (*p))
+  while (p < limit && ISXDIGIT (*p))
    {
      value = 16 * value + host_hex_value (*p);
      ++p;
@ -488,7 +489,7 @@ convert_escape (struct type *type, const char *dest_charset,

    case 'x':
      ADVANCE;
-      if (!isxdigit (*p))
+      if (!ISXDIGIT (*p))
 	error (_("\\x used with no following hex digits."));
      p = convert_hex (type, p, limit, output);
      break;
@ -510,7 +511,7 @@ convert_escape (struct type *type, const char *dest_charset,
 	int length = *p == 'u' ? 4 : 8;

 	ADVANCE;
-	if (!isxdigit (*p))
+	if (!ISXDIGIT (*p))
 	  error (_("\\u used with no following hex digits"));
 	p = convert_ucn (p, limit, dest_charset, output, length);
      }
--- a/gdb/c-support.h
+++ b/gdb/c-support.h
@ -0,0 +1,46 @@
+/* Helper routines for C support in GDB.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef C_SUPPORT_H
+#define C_SUPPORT_H
+
+#include "safe-ctype.h"
+
+/* Like ISALPHA, but also returns true for the union of all UTF-8
+   multi-byte sequence bytes and non-ASCII characters in
+   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
+   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
+   identifiers, but we don't need to be pedantic so for simplicity we
+   ignore that here.  Plus this avoids the complication of actually
+   knowing what was the right encoding.  */
+
+static inline bool
+c_ident_is_alpha (unsigned char ch)
+{
+  return ISALPHA (ch) || ch >= 0x80;
+}
+
+/* Similarly, but Like ISALNUM.  */
+
+static inline bool
+c_ident_is_alnum (unsigned char ch)
+{
+  return ISALNUM (ch) || ch >= 0x80;
+}
+
+#endif /* C_SUPPORT_H */
--- a/gdb/cp-name-parser.y
+++ b/gdb/cp-name-parser.y
@ -35,6 +35,7 @@
 #include "safe-ctype.h"
 #include "demangle.h"
 #include "cp-support.h"
+#include "c-support.h"

 /* Bison does not make it easy to create a parser without global
   state, unfortunately.  Here are all the global variables used
@ -1304,28 +1305,6 @@ d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp
 		      fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs));
 }

-/* Like ISALPHA, but also returns true for the union of all UTF-8
-   multi-byte sequence bytes and non-ASCII characters in
-   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
-   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
-   identifiers, but we don't need to be pedantic so for simplicity we
-   ignore that here.  Plus this avoids the complication of actually
-   knowing what was the right encoding.  */
-
-static inline bool
-cp_ident_is_alpha (unsigned char ch)
-{
-  return ISALPHA (ch) || ch >= 0x80;
-}
-
-/* Similarly, but Like ISALNUM.  */
-
-static inline bool
-cp_ident_is_alnum (unsigned char ch)
-{
-  return ISALNUM (ch) || ch >= 0x80;
-}
-
 /* Find the end of a symbol name starting at LEXPTR.  */

 static const char *
@ -1333,7 +1312,7 @@ symbol_end (const char *lexptr)
 {
  const char *p = lexptr;

-  while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
+  while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
    p++;

  return p;
@ -1813,7 +1792,7 @@ yylex (void)
      return ERROR;
    }

-  if (!(c == '_' || c == '$' || cp_ident_is_alpha (c)))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
    {
      /* We must have come across a bad character (e.g. ';').  */
      yyerror (_("invalid character"));
@ -1824,7 +1803,7 @@ yylex (void)
  namelen = 0;
  do
    c = tokstart[++namelen];
-  while (cp_ident_is_alnum (c) || c == '_' || c == '$');
+  while (c_ident_is_alnum (c) || c == '_' || c == '$');

  lexptr += namelen;

--- a/gdb/testsuite/ChangeLog
+++ b/gdb/testsuite/ChangeLog
@ -1,3 +1,9 @@
+2018-05-22  Pedro Alves  <palves@redhat.com>
+
+	PR gdb/22973
+	* gdb.base/utf8-identifiers.c: New file.
+	* gdb.base/utf8-identifiers.exp: New file.
+
 2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>

 	* gdb.arch/powerpc-fpscr-gcore.exp: New file.
--- a/gdb/testsuite/gdb.base/utf8-identifiers.c
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.c
@ -0,0 +1,71 @@
+/* -*- coding: utf-8 -*- */
+
+/* This testcase is part of GDB, the GNU debugger.
+
+   Copyright 2017-2018 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* UTF-8 "função1".  */
+#define FUNCAO1 fun\u00e7\u00e3o1
+
+/* UTF-8 "função2".  */
+#define FUNCAO2 fun\u00e7\u00e3o2
+
+/* UTF-8 "my_função".  */
+#define MY_FUNCAO my_fun\u00e7\u00e3o
+
+/* UTF-8 "num_€".  */
+#define NUM_EUROS num_\u20ac
+
+struct S
+{
+  int NUM_EUROS;
+} g_s;
+
+void
+FUNCAO1 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+FUNCAO2 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+MY_FUNCAO (void)
+{
+}
+
+int NUM_EUROS = 2000;
+
+static void
+done ()
+{
+}
+
+int
+main ()
+{
+  FUNCAO1 ();
+  done ();
+  FUNCAO2 ();
+  MY_FUNCAO ();
+
+  return 0;
+}
--- a/gdb/testsuite/gdb.base/utf8-identifiers.exp
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.exp
@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*- */
+
+# This testcase is part of GDB, the GNU debugger.
+
+# Copyright 2017-2018 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test GDB's support for UTF-8 C/C++ identifiers.
+
+load_lib completion-support.exp
+
+standard_testfile
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+if { [prepare_for_testing "failed to prepare" ${testfile} [list $srcfile]] } {
+    return -1
+}
+
+if ![runto done] {
+    fail "couldn't run to done"
+    return
+}
+
+# Test expressions.
+gdb_test "print g_s.num_€" " = 1000"
+gdb_test "print num_€" " = 2000"
+
+# Test linespecs/breakpoints.
+gdb_test "break função2" "Breakpoint $decimal at .*$srcfile.*"
+
+set test "info breakpoints"
+gdb_test_multiple $test $test {
+    -re "in função2 at .*$srcfile.*$gdb_prompt $" {
+	pass $test
+    }
+}
+
+gdb_test "continue" \
+    "Breakpoint $decimal, função2 \\(\\) at .*$srcfile.*"
+
+# Unload symbols from shared libraries to avoid random symbol and file
+# names getting in the way of completion.
+gdb_test_no_output "nosharedlibrary"
+
+# Test linespec completion.
+
+# A unique completion.
+test_gdb_complete_unique "break my_fun" "break my_função"
+
+# A multiple-matches completion:
+
+# kfailed because gdb/readline display the completion match list like
+# this, with no separating space:
+#
+#  (gdb) break função[TAB]
+#  função1função2
+#
+# ... which is bogus.
+setup_kfail "gdb/23211" "*-*-*"
+test_gdb_complete_multiple "break " "fun" "ção" {"função1" "função2"}
+
+# Test expression completion.
+test_gdb_complete_unique "print g_s.num" "print g_s.num_€"