Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973)

Factor cp_ident_is_alpha/cp_ident_is_alnum out of
gdb/cp-name-parser.y into the new gdb/c-support.h (renamed to
c_ident_is_alpha/c_ident_is_alnum) and use them in the C/C++
expression parser too.

New test included.

gdb/ChangeLog:
2018-05-22  Pedro Alves  <palves@redhat.com>
	    張俊芝  <zjz@zjz.name>

	PR gdb/22973
	* c-exp.y: Include "c-support.h".
	(parse_number, c_parse_escape, lex_one_token): Use TOLOWER,
	ISXDIGIT, ISDIGIT and ISSPACE instead of tolower, isxdigit, isdigit
	and isspace.  Use c_ident_is_alpha and c_ident_is_alnum to scan
	names.
	* c-lang.c: Include "c-support.h".
	(convert_ucn, convert_octal, convert_hex, convert_escape): Use
	ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
	* c-support.h: New file, with bits factored out from ...
	* cp-name-parser.y: ... this file.
	Include "c-support.h".
	(cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
	c-support.h and renamed.
	(symbol_end, yylex): Adjust.

gdb/testsuite/ChangeLog:
2018-05-22  Pedro Alves  <palves@redhat.com>

	PR gdb/22973
	* gdb.base/utf8-identifiers.c: New file.
	* gdb.base/utf8-identifiers.exp: New file.
This commit is contained in:
Pedro Alves 2018-05-22 17:35:38 +01:00
parent 0ec848ad25
commit b1b60145ae
8 changed files with 240 additions and 44 deletions

View file

@ -1,3 +1,20 @@
2018-05-22 Pedro Alves <palves@redhat.com>
張俊芝 <zjz@zjz.name>
PR gdb/22973
* c-exp.y: Include "c-support.h".
(parse_number, c_parse_escape, lex_one_token): Use TOLOWER,
ISXDIGIT, ISDIGIT and ISSPACE instead of tolower, isxdigit, isdigit
and isspace. Use c_ident_is_alpha and c_ident_is_alnum to scan names.
* c-lang.c: Include "c-support.h".
(convert_ucn, convert_octal, convert_hex, convert_escape): Use
ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
* c-support.h: New file, with bits factored out from ...
* cp-name-parser.y: ... this file.
Include "c-support.h".
(cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
c-support.h and renamed.
(symbol_end, yylex): Adjust.
2018-05-22 Pedro Franco de Carvalho <pedromfc@linux.vnet.ibm.com>
* arch/ppc-linux-common.c (ppc_linux_has_isa205): Change the

View file

@ -42,6 +42,7 @@
#include "parser-defs.h"
#include "language.h"
#include "c-lang.h"
#include "c-support.h"
#include "bfd.h" /* Required by objfiles.h. */
#include "symfile.h" /* Required by objfiles.h. */
#include "objfiles.h" /* For have_full_symbols and have_partial_symbols */
@ -1806,13 +1807,13 @@ parse_number (struct parser_state *par_state,
len -= 2;
}
/* Handle suffixes: 'f' for float, 'l' for long double. */
else if (len >= 1 && tolower (p[len - 1]) == 'f')
else if (len >= 1 && TOLOWER (p[len - 1]) == 'f')
{
putithere->typed_val_float.type
= parse_type (par_state)->builtin_float;
len -= 1;
}
else if (len >= 1 && tolower (p[len - 1]) == 'l')
else if (len >= 1 && TOLOWER (p[len - 1]) == 'l')
{
putithere->typed_val_float.type
= parse_type (par_state)->builtin_long_double;
@ -2023,9 +2024,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
if (output)
obstack_grow_str (output, "\\x");
++tokptr;
if (!isxdigit (*tokptr))
if (!ISXDIGIT (*tokptr))
error (_("\\x escape without a following hex digit"));
while (isxdigit (*tokptr))
while (ISXDIGIT (*tokptr))
{
if (output)
obstack_1grow (output, *tokptr);
@ -2048,7 +2049,7 @@ c_parse_escape (const char **ptr, struct obstack *output)
if (output)
obstack_grow_str (output, "\\");
for (i = 0;
i < 3 && isdigit (*tokptr) && *tokptr != '8' && *tokptr != '9';
i < 3 && ISDIGIT (*tokptr) && *tokptr != '8' && *tokptr != '9';
++i)
{
if (output)
@ -2073,9 +2074,9 @@ c_parse_escape (const char **ptr, struct obstack *output)
obstack_1grow (output, *tokptr);
}
++tokptr;
if (!isxdigit (*tokptr))
if (!ISXDIGIT (*tokptr))
error (_("\\%c escape without a following hex digit"), c);
for (i = 0; i < len && isxdigit (*tokptr); ++i)
for (i = 0; i < len && ISXDIGIT (*tokptr); ++i)
{
if (output)
obstack_1grow (output, *tokptr);
@ -2668,7 +2669,7 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
size_t len = strlen ("selector");
if (strncmp (p, "selector", len) == 0
&& (p[len] == '\0' || isspace (p[len])))
&& (p[len] == '\0' || ISSPACE (p[len])))
{
lexptr = p + len;
return SELECTOR;
@ -2677,9 +2678,9 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
goto parse_string;
}
while (isspace (*p))
while (ISSPACE (*p))
p++;
if (strncmp (p, "entry", len) == 0 && !isalnum (p[len])
if (strncmp (p, "entry", len) == 0 && !c_ident_is_alnum (p[len])
&& p[len] != '_')
{
lexptr = &p[len];
@ -2741,16 +2742,14 @@ lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
}
}
if (!(c == '_' || c == '$'
|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
/* We must have come across a bad character (e.g. ';'). */
error (_("Invalid character '%c' in expression."), c);
/* It's a name. See how long it is. */
namelen = 0;
for (c = tokstart[namelen];
(c == '_' || c == '$' || (c >= '0' && c <= '9')
|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '<');)
(c == '_' || c == '$' || c_ident_is_alnum (c) || c == '<');)
{
/* Template parameter lists are part of the name.
FIXME: This mishandles `print $a<4&&$a>3'. */

View file

@ -25,6 +25,7 @@
#include "language.h"
#include "varobj.h"
#include "c-lang.h"
#include "c-support.h"
#include "valprint.h"
#include "macroscope.h"
#include "charset.h"
@ -382,7 +383,7 @@ convert_ucn (char *p, char *limit, const char *dest_charset,
gdb_byte data[4];
int i;
for (i = 0; i < length && p < limit && isxdigit (*p); ++i, ++p)
for (i = 0; i < length && p < limit && ISXDIGIT (*p); ++i, ++p)
result = (result << 4) + host_hex_value (*p);
for (i = 3; i >= 0; --i)
@ -424,7 +425,7 @@ convert_octal (struct type *type, char *p,
unsigned long value = 0;
for (i = 0;
i < 3 && p < limit && isdigit (*p) && *p != '8' && *p != '9';
i < 3 && p < limit && ISDIGIT (*p) && *p != '8' && *p != '9';
++i)
{
value = 8 * value + host_hex_value (*p);
@ -447,7 +448,7 @@ convert_hex (struct type *type, char *p,
{
unsigned long value = 0;
while (p < limit && isxdigit (*p))
while (p < limit && ISXDIGIT (*p))
{
value = 16 * value + host_hex_value (*p);
++p;
@ -488,7 +489,7 @@ convert_escape (struct type *type, const char *dest_charset,
case 'x':
ADVANCE;
if (!isxdigit (*p))
if (!ISXDIGIT (*p))
error (_("\\x used with no following hex digits."));
p = convert_hex (type, p, limit, output);
break;
@ -510,7 +511,7 @@ convert_escape (struct type *type, const char *dest_charset,
int length = *p == 'u' ? 4 : 8;
ADVANCE;
if (!isxdigit (*p))
if (!ISXDIGIT (*p))
error (_("\\u used with no following hex digits"));
p = convert_ucn (p, limit, dest_charset, output, length);
}

46
gdb/c-support.h Normal file
View file

@ -0,0 +1,46 @@
/* Helper routines for C support in GDB.
Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of GDB.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
#ifndef C_SUPPORT_H
#define C_SUPPORT_H
#include "safe-ctype.h"
/* Identifier-scanning variant of ISALPHA.  Returns true for every
   character ISALPHA accepts and, in addition, for every byte that has
   its high bit set (0x80 and above).  That second set is the union of
   all bytes that can appear in a UTF-8 multi-byte sequence and of the
   non-ASCII characters of extended-ASCII charsets (e.g., Latin1).

   Not every UTF-8 range is actually allowed in a C++ identifier, but
   there is no need to be pedantic here; accepting them all keeps the
   check simple and avoids having to know which encoding is really in
   use.  */

static inline bool
c_ident_is_alpha (unsigned char ch)
{
  /* Any byte outside the 7-bit ASCII range is accepted as-is.  */
  if (ch >= 0x80)
    return true;

  return ISALPHA (ch) != 0;
}
/* Identifier-scanning variant of ISALNUM.  Returns true for every
   character ISALNUM accepts and, in addition, for every byte that has
   its high bit set (0x80 and above), i.e. UTF-8 multi-byte sequence
   bytes and non-ASCII extended-ASCII characters.  */

static inline bool
c_ident_is_alnum (unsigned char ch)
{
  /* Any byte outside the 7-bit ASCII range is accepted as-is.  */
  if (ch >= 0x80)
    return true;

  return ISALNUM (ch) != 0;
}
#endif /* C_SUPPORT_H */

View file

@ -35,6 +35,7 @@
#include "safe-ctype.h"
#include "demangle.h"
#include "cp-support.h"
#include "c-support.h"
/* Bison does not make it easy to create a parser without global
state, unfortunately. Here are all the global variables used
@ -1304,28 +1305,6 @@ d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp
fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs));
}
/* ISALPHA extended for identifier scanning: besides the characters
   ISALPHA accepts, any byte with the high bit set is accepted too.
   Such bytes are either part of a UTF-8 multi-byte sequence or a
   non-ASCII character of an extended-ASCII charset (e.g., Latin1).
   Some UTF-8 ranges are not valid in C++ identifiers, but we are
   deliberately not pedantic about that: it keeps things simple and
   means we never need to know what the right encoding was.  */

static inline bool
cp_ident_is_alpha (unsigned char ch)
{
  const bool high_bit_set = (ch & 0x80) != 0;

  return high_bit_set || ISALPHA (ch);
}
/* ISALNUM extended for identifier scanning: besides the characters
   ISALNUM accepts, any byte with the high bit set (UTF-8 multi-byte
   sequence bytes, non-ASCII extended-ASCII characters) is accepted
   too.  */

static inline bool
cp_ident_is_alnum (unsigned char ch)
{
  const bool high_bit_set = (ch & 0x80) != 0;

  return high_bit_set || ISALNUM (ch);
}
/* Find the end of a symbol name starting at LEXPTR. */
static const char *
@ -1333,7 +1312,7 @@ symbol_end (const char *lexptr)
{
const char *p = lexptr;
while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
p++;
return p;
@ -1813,7 +1792,7 @@ yylex (void)
return ERROR;
}
if (!(c == '_' || c == '$' || cp_ident_is_alpha (c)))
if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
{
/* We must have come across a bad character (e.g. ';'). */
yyerror (_("invalid character"));
@ -1824,7 +1803,7 @@ yylex (void)
namelen = 0;
do
c = tokstart[++namelen];
while (cp_ident_is_alnum (c) || c == '_' || c == '$');
while (c_ident_is_alnum (c) || c == '_' || c == '$');
lexptr += namelen;

View file

@ -1,3 +1,9 @@
2018-05-22 Pedro Alves <palves@redhat.com>
PR gdb/22973
* gdb.base/utf8-identifiers.c: New file.
* gdb.base/utf8-identifiers.exp: New file.
2018-05-22 Pedro Franco de Carvalho <pedromfc@linux.vnet.ibm.com>
* gdb.arch/powerpc-fpscr-gcore.exp: New file.

View file

@ -0,0 +1,71 @@
/* -*- coding: utf-8 -*- */
/* This testcase is part of GDB, the GNU debugger.
Copyright 2017-2018 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* UTF-8 "função1". */
#define FUNCAO1 fun\u00e7\u00e3o1
/* UTF-8 "função2". */
#define FUNCAO2 fun\u00e7\u00e3o2
/* UTF-8 "my_função". */
#define MY_FUNCAO my_fun\u00e7\u00e3o
/* UTF-8 "num_€". */
#define NUM_EUROS num_\u20ac
/* Structure with a single member whose name, once NUM_EUROS is
   expanded, contains a non-ASCII character (num_\u20ac).  g_s is the
   global instance written by FUNCAO1 and FUNCAO2 below.  */
struct S
{
int NUM_EUROS;
} g_s;
/* First of two functions whose names differ only in the trailing
   digit (see the FUNCAO1 and FUNCAO2 macros).  Stores 1000 into the
   non-ASCII-named member of g_s.  */
void
FUNCAO1 (void)
{
g_s.NUM_EUROS = 1000;
}
/* Second of two functions whose names differ only in the trailing
   digit (see the FUNCAO1 and FUNCAO2 macros).  Stores 1000 into the
   non-ASCII-named member of g_s.  */
void
FUNCAO2 (void)
{
g_s.NUM_EUROS = 1000;
}
/* Empty function whose name, after MY_FUNCAO is expanded, consists of
   an ASCII prefix (my_fun) followed by non-ASCII characters.  */
void
MY_FUNCAO (void)
{
}
/* Global variable whose name contains a non-ASCII character once
   NUM_EUROS is expanded (num_\u20ac).  It has the same name as the
   member of struct S but a different value (2000).  */
int NUM_EUROS = 2000;
/* Empty marker function, called by main between FUNCAO1 and FUNCAO2.
   Declared with a (void) prototype, like the other functions in this
   file, rather than with an unprototyped empty parameter list.  */

static void
done (void)
{
}
/* Entry point of the test program.  FUNCAO1 is called before done, so
   g_s.NUM_EUROS already holds 1000 by the time done is reached;
   FUNCAO2 and MY_FUNCAO are called afterwards.  Declared with a
   (void) prototype rather than an unprototyped empty parameter
   list.  Always returns 0.  */

int
main (void)
{
  FUNCAO1 ();
  done ();
  FUNCAO2 ();
  MY_FUNCAO ();
  return 0;
}

View file

@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# This testcase is part of GDB, the GNU debugger.
# Copyright 2017-2018 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Test GDB's support for UTF-8 C/C++ identifiers, in expressions,
# linespecs/breakpoints and completion.
# NOTE(review): test_gdb_complete_unique and test_gdb_complete_multiple
# used below presumably come from this library -- confirm.
load_lib completion-support.exp
standard_testfile
# Enable basic use of UTF-8. LC_ALL gets reset for each testfile.
setenv LC_ALL C.UTF-8
if { [prepare_for_testing "failed to prepare" ${testfile} [list $srcfile]] } {
return -1
}
# Stop in done.  The test program calls função1 before done, so by
# then g_s.num_€ has already been set to 1000.
if ![runto done] {
fail "couldn't run to done"
return
}
# Test expressions.
# A struct member with a non-ASCII name.
gdb_test "print g_s.num_€" " = 1000"
# A global variable with a non-ASCII name; it is initialized to 2000.
gdb_test "print num_€" " = 2000"
# Test linespecs/breakpoints.
gdb_test "break função2" "Breakpoint $decimal at .*$srcfile.*"
# Check that the breakpoint listing shows the non-ASCII function name.
set test "info breakpoints"
gdb_test_multiple $test $test {
-re "in função2 at .*$srcfile.*$gdb_prompt $" {
pass $test
}
}
# The test program calls função2 after done, so continuing from done
# should stop at the breakpoint just set.
gdb_test "continue" \
"Breakpoint $decimal, função2 \\(\\) at .*$srcfile.*"
# Unload symbols from shared libraries to avoid random symbol and file
# names getting in the way of completion.
gdb_test_no_output "nosharedlibrary"
# Test linespec completion.
# A unique completion.
test_gdb_complete_unique "break my_fun" "break my_função"
# A multiple-matches completion.
# This is KFAILed because GDB/readline displays the completion match
# list like this, with no separating space:
#
# (gdb) break função[TAB]
# função1função2
#
# ... which is bogus.
setup_kfail "gdb/23211" "*-*-*"
test_gdb_complete_multiple "break " "fun" "ção" {"função1" "função2"}
# Test expression completion.
test_gdb_complete_unique "print g_s.num" "print g_s.num_€"