gcc/libstdc++-v3/include/std/text_encoding
Jonathan Wakely df0a668b78 libstdc++: Implement C++26 std::text_encoding (P1885R12) [PR113318]
This is another C++26 change, approved in Varna 2023. We require a new
static array of data that is extracted from the IANA Character Sets
database. A new Python script to generate a header from the IANA CSV
file is added.

The text_encoding class is basically just a pointer to an {ID,name} pair
in the static array. The aliases view is also just the same pointer (or
empty), and the view's iterator moves forwards and backwards in the
array while the array elements have the same ID (or to one element
further, for a past-the-end iterator).

Because those iterators refer to a global array that never goes out of
scope, there's no reason they should ever produce undefined behaviour
or indeterminate values.  They should either have well-defined
behaviour, or abort. The overhead of ensuring those properties is pretty
low, so seems worth it.

This means that an aliases_view iterator should never be able to access
out-of-bounds. A non-value-initialized iterator always points to an
element of the static array even when not dereferenceable (the array has
unreachable entries at the start and end, which means that even a
past-the-end iterator for the last encoding in the array still points to
valid memory).  Dereferencing an iterator can always return a valid
array element, or "" for a non-dereferenceable iterator (but doing so
will abort when assertions are enabled).  In the language being proposed
for C++26, dereferencing an invalid iterator erroneously returns "".
Attempting to increment/decrement past the last/first element in the
view is erroneously a no-op, so aborts when assertions are enabled, and
doesn't change value otherwise.

Similarly, constructing a std::text_encoding with an invalid id (one
that doesn't have the value of an enumerator) erroneously behaves the
same as constructing with id::unknown, or aborts with assertions
enabled.

libstdc++-v3/ChangeLog:

	PR libstdc++/113318
	* acinclude.m4 (GLIBCXX_CONFIGURE): Add c++26 directory.
	(GLIBCXX_CHECK_TEXT_ENCODING): Define.
	* config.h.in: Regenerate.
	* configure: Regenerate.
	* configure.ac: Use GLIBCXX_CHECK_TEXT_ENCODING.
	* include/Makefile.am: Add new headers.
	* include/Makefile.in: Regenerate.
	* include/bits/locale_classes.h (locale::encoding): Declare new
	member function.
	* include/bits/unicode.h (__charset_alias_match): New function.
	* include/bits/text_encoding-data.h: New file.
	* include/bits/version.def (text_encoding): Define.
	* include/bits/version.h: Regenerate.
	* include/std/text_encoding: New file.
	* src/Makefile.am: Add new subdirectory.
	* src/Makefile.in: Regenerate.
	* src/c++26/Makefile.am: New file.
	* src/c++26/Makefile.in: New file.
	* src/c++26/text_encoding.cc: New file.
	* src/experimental/Makefile.am: Include c++26 convenience
	library.
	* src/experimental/Makefile.in: Regenerate.
	* python/libstdcxx/v6/printers.py (StdTextEncodingPrinter): New
	printer.
	* scripts/gen_text_encoding_data.py: New file.
	* testsuite/22_locale/locale/encoding.cc: New test.
	* testsuite/ext/unicode/charset_alias_match.cc: New test.
	* testsuite/std/text_encoding/cons.cc: New test.
	* testsuite/std/text_encoding/members.cc: New test.
	* testsuite/std/text_encoding/requirements.cc: New test.

Reviewed-by: Ulrich Drepper <drepper.fsp@gmail.com>
Reviewed-by: Patrick Palka <ppalka@redhat.com>
2024-01-17 11:49:11 +00:00

678 lines
16 KiB
C++

// <text_encoding> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file include/text_encoding
* This is a Standard C++ Library header.
*/
#ifndef _GLIBCXX_TEXT_ENCODING
#define _GLIBCXX_TEXT_ENCODING
#pragma GCC system_header
#include <bits/requires_hosted.h>
#define __glibcxx_want_text_encoding
#include <bits/version.h>
#ifdef __cpp_lib_text_encoding
#include <compare>
#include <string_view>
#include <bits/functional_hash.h> // hash
#include <bits/ranges_util.h> // view_interface
#include <bits/unicode.h> // __charset_alias_match
#include <ext/numeric_traits.h> // __int_traits
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
/**
* @brief An interface for accessing the IANA Character Sets registry.
* @ingroup locales
* @since C++26
*/
struct text_encoding
{
private:
// A single {ID, name} entry of the static encoding table.
struct _Rep
{
  using id = __INT_LEAST32_TYPE__;

  id _M_id;             // Numeric identifier of the encoding.
  const char* _M_name;  // Name or alias as a C string (null in the sentinel).

  // Order an entry against a bare ID, so that the table can be
  // binary-searched by ID.
  friend constexpr bool
  operator<(const _Rep& __entry, id __mib) noexcept
  { return __mib > __entry._M_id; }

  // An entry equals a string when its name is exactly that string.
  friend constexpr bool
  operator==(const _Rep& __entry, string_view __str) noexcept
  { return string_view(__entry._M_name) == __str; }
};
public:
// Upper bound on the number of characters of an encoding name that a
// text_encoding stores; the constructors copy at most this many.
static constexpr size_t max_name_length = 63;
// Identifiers for the known character encodings.  The underlying type is
// _Rep::id.  The values are neither contiguous nor strictly increasing in
// declaration order (for example PC8CodePage437 = 2011 is declared after
// IBM424 = 2043).  other (1) and unknown (2) match the IDs of the first
// two, nameless, entries of _S_reps.
enum class id : _Rep::id
{
other = 1,
unknown = 2,
ASCII = 3,
ISOLatin1 = 4,
ISOLatin2 = 5,
ISOLatin3 = 6,
ISOLatin4 = 7,
ISOLatinCyrillic = 8,
ISOLatinArabic = 9,
ISOLatinGreek = 10,
ISOLatinHebrew = 11,
ISOLatin5 = 12,
ISOLatin6 = 13,
ISOTextComm = 14,
HalfWidthKatakana = 15,
JISEncoding = 16,
ShiftJIS = 17,
EUCPkdFmtJapanese = 18,
EUCFixWidJapanese = 19,
ISO4UnitedKingdom = 20,
ISO11SwedishForNames = 21,
ISO15Italian = 22,
ISO17Spanish = 23,
ISO21German = 24,
ISO60DanishNorwegian = 25,
ISO69French = 26,
ISO10646UTF1 = 27,
ISO646basic1983 = 28,
INVARIANT = 29,
ISO2IntlRefVersion = 30,
NATSSEFI = 31,
NATSSEFIADD = 32,
ISO10Swedish = 35,
KSC56011987 = 36,
ISO2022KR = 37,
EUCKR = 38,
ISO2022JP = 39,
ISO2022JP2 = 40,
ISO13JISC6220jp = 41,
ISO14JISC6220ro = 42,
ISO16Portuguese = 43,
ISO18Greek7Old = 44,
ISO19LatinGreek = 45,
ISO25French = 46,
ISO27LatinGreek1 = 47,
ISO5427Cyrillic = 48,
ISO42JISC62261978 = 49,
ISO47BSViewdata = 50,
ISO49INIS = 51,
ISO50INIS8 = 52,
ISO51INISCyrillic = 53,
ISO54271981 = 54,
ISO5428Greek = 55,
ISO57GB1988 = 56,
ISO58GB231280 = 57,
ISO61Norwegian2 = 58,
ISO70VideotexSupp1 = 59,
ISO84Portuguese2 = 60,
ISO85Spanish2 = 61,
ISO86Hungarian = 62,
ISO87JISX0208 = 63,
ISO88Greek7 = 64,
ISO89ASMO449 = 65,
ISO90 = 66,
ISO91JISC62291984a = 67,
ISO92JISC62991984b = 68,
ISO93JIS62291984badd = 69,
ISO94JIS62291984hand = 70,
ISO95JIS62291984handadd = 71,
ISO96JISC62291984kana = 72,
ISO2033 = 73,
ISO99NAPLPS = 74,
ISO102T617bit = 75,
ISO103T618bit = 76,
ISO111ECMACyrillic = 77,
ISO121Canadian1 = 78,
ISO122Canadian2 = 79,
ISO123CSAZ24341985gr = 80,
ISO88596E = 81,
ISO88596I = 82,
ISO128T101G2 = 83,
ISO88598E = 84,
ISO88598I = 85,
ISO139CSN369103 = 86,
ISO141JUSIB1002 = 87,
ISO143IECP271 = 88,
ISO146Serbian = 89,
ISO147Macedonian = 90,
ISO150 = 91,
ISO151Cuba = 92,
ISO6937Add = 93,
ISO153GOST1976874 = 94,
ISO8859Supp = 95,
ISO10367Box = 96,
ISO158Lap = 97,
ISO159JISX02121990 = 98,
ISO646Danish = 99,
USDK = 100,
DKUS = 101,
KSC5636 = 102,
Unicode11UTF7 = 103,
ISO2022CN = 104,
ISO2022CNEXT = 105,
UTF8 = 106,
ISO885913 = 109,
ISO885914 = 110,
ISO885915 = 111,
ISO885916 = 112,
GBK = 113,
GB18030 = 114,
OSDEBCDICDF0415 = 115,
OSDEBCDICDF03IRV = 116,
OSDEBCDICDF041 = 117,
ISO115481 = 118,
KZ1048 = 119,
UCS2 = 1000,
UCS4 = 1001,
UnicodeASCII = 1002,
UnicodeLatin1 = 1003,
UnicodeJapanese = 1004,
UnicodeIBM1261 = 1005,
UnicodeIBM1268 = 1006,
UnicodeIBM1276 = 1007,
UnicodeIBM1264 = 1008,
UnicodeIBM1265 = 1009,
Unicode11 = 1010,
SCSU = 1011,
UTF7 = 1012,
UTF16BE = 1013,
UTF16LE = 1014,
UTF16 = 1015,
CESU8 = 1016,
UTF32 = 1017,
UTF32BE = 1018,
UTF32LE = 1019,
BOCU1 = 1020,
UTF7IMAP = 1021,
Windows30Latin1 = 2000,
Windows31Latin1 = 2001,
Windows31Latin2 = 2002,
Windows31Latin5 = 2003,
HPRoman8 = 2004,
AdobeStandardEncoding = 2005,
VenturaUS = 2006,
VenturaInternational = 2007,
DECMCS = 2008,
PC850Multilingual = 2009,
PC8DanishNorwegian = 2012,
PC862LatinHebrew = 2013,
PC8Turkish = 2014,
IBMSymbols = 2015,
IBMThai = 2016,
HPLegal = 2017,
HPPiFont = 2018,
HPMath8 = 2019,
HPPSMath = 2020,
HPDesktop = 2021,
VenturaMath = 2022,
MicrosoftPublishing = 2023,
Windows31J = 2024,
GB2312 = 2025,
Big5 = 2026,
Macintosh = 2027,
IBM037 = 2028,
IBM038 = 2029,
IBM273 = 2030,
IBM274 = 2031,
IBM275 = 2032,
IBM277 = 2033,
IBM278 = 2034,
IBM280 = 2035,
IBM281 = 2036,
IBM284 = 2037,
IBM285 = 2038,
IBM290 = 2039,
IBM297 = 2040,
IBM420 = 2041,
IBM423 = 2042,
IBM424 = 2043,
PC8CodePage437 = 2011,
IBM500 = 2044,
IBM851 = 2045,
PCp852 = 2010,
IBM855 = 2046,
IBM857 = 2047,
IBM860 = 2048,
IBM861 = 2049,
IBM863 = 2050,
IBM864 = 2051,
IBM865 = 2052,
IBM868 = 2053,
IBM869 = 2054,
IBM870 = 2055,
IBM871 = 2056,
IBM880 = 2057,
IBM891 = 2058,
IBM903 = 2059,
IBM904 = 2060,
IBM905 = 2061,
IBM918 = 2062,
IBM1026 = 2063,
IBMEBCDICATDE = 2064,
EBCDICATDEA = 2065,
EBCDICCAFR = 2066,
EBCDICDKNO = 2067,
EBCDICDKNOA = 2068,
EBCDICFISE = 2069,
EBCDICFISEA = 2070,
EBCDICFR = 2071,
EBCDICIT = 2072,
EBCDICPT = 2073,
EBCDICES = 2074,
EBCDICESA = 2075,
EBCDICESS = 2076,
EBCDICUK = 2077,
EBCDICUS = 2078,
Unknown8BiT = 2079,
Mnemonic = 2080,
Mnem = 2081,
VISCII = 2082,
VIQR = 2083,
KOI8R = 2084,
HZGB2312 = 2085,
IBM866 = 2086,
PC775Baltic = 2087,
KOI8U = 2088,
IBM00858 = 2089,
IBM00924 = 2090,
IBM01140 = 2091,
IBM01141 = 2092,
IBM01142 = 2093,
IBM01143 = 2094,
IBM01144 = 2095,
IBM01145 = 2096,
IBM01146 = 2097,
IBM01147 = 2098,
IBM01148 = 2099,
IBM01149 = 2100,
Big5HKSCS = 2101,
IBM1047 = 2102,
PTCP154 = 2103,
Amiga1251 = 2104,
KOI7switched = 2105,
BRF = 2106,
TSCII = 2107,
CP51932 = 2108,
windows874 = 2109,
windows1250 = 2250,
windows1251 = 2251,
windows1252 = 2252,
windows1253 = 2253,
windows1254 = 2254,
windows1255 = 2255,
windows1256 = 2256,
windows1257 = 2257,
windows1258 = 2258,
TIS620 = 2259,
CP50220 = 2260
};
// Make the enumerators nameable as members of text_encoding
// (e.g. text_encoding::UTF8).
using enum id;
// Default construction leaves the default member initializers in effect:
// the representation refers to the id::unknown entry and the name is empty.
constexpr text_encoding() = default;
// Construct from an encoding name.  The name is looked up in the static
// table with _S_find_name (which yields the id::other entry when nothing
// matches), and up to max_name_length characters of __enc are stored as
// the name of this object.
constexpr explicit
text_encoding(string_view __enc) noexcept
: _M_rep(_S_find_name(__enc))
{
// _M_name is zero-initialized and one element longer than
// max_name_length, so the stored copy is always null-terminated.
__enc.copy(_M_name, max_name_length);
}
// Construct from an identifier; the table entry is located with
// _S_find_id and that entry's name becomes the name of this object.
// @pre i has the value of one of the enumerators of id.
constexpr
text_encoding(id __i) noexcept
: _M_rep(_S_find_id(__i))
{
  // Entries with an empty name (such as other and unknown) leave the
  // zero-initialized name buffer untouched.
  const string_view __primary(_M_rep->_M_name);
  if (__primary.empty())
    return;
  __primary.copy(_M_name, max_name_length);
}
// The identifier of the table entry this object refers to.
constexpr id
mib() const noexcept
{ return static_cast<id>(_M_rep->_M_id); }

// The stored encoding name, as a null-terminated string.
constexpr const char*
name() const noexcept
{ return _M_name; }
// A view over the names recorded for one encoding in the static table.
// The view stores only a pointer to a table entry, or null when empty.
struct aliases_view : ranges::view_interface<aliases_view>
{
private:
// Iterator type, defined after the enclosing text_encoding class.
class _Iterator;
// Empty tag type returned by end(); an _Iterator compares equal to it
// when the iterator is not dereferenceable.
struct _Sentinel { };
public:
// Iterator positioned at the stored table entry.
constexpr _Iterator begin() const noexcept;
constexpr _Sentinel end() const noexcept { return {}; }
private:
// Only text_encoding creates views, through the private constructor.
friend struct text_encoding;
constexpr explicit aliases_view(const _Rep* __r) : _M_begin(__r) { }
// First table entry of the viewed encoding, or null for an empty view.
const _Rep* _M_begin = nullptr;
};
// Return a view of the names recorded for this encoding.
// The view is empty when the referenced table entry has an empty name
// (as the entries for id::other and id::unknown do).
constexpr aliases_view
aliases() const noexcept
{
  const bool __nameless = _M_rep->_M_name[0] == '\0';
  if (__nameless)
    return aliases_view{nullptr};
  return aliases_view(_M_rep);
}
// Two encodings are equal when their identifiers are equal, except that
// two encodings which are both id::other are compared by name instead,
// using the alias-matching rules of _S_comp.
friend constexpr bool
operator==(const text_encoding& __a,
           const text_encoding& __b) noexcept
{
  const id __lhs = __a.mib();
  const id __rhs = __b.mib();
  const bool __both_other = __lhs == id::other && __rhs == id::other;
  if (!__both_other) [[likely]]
    return __lhs == __rhs;
  return _S_comp(__a._M_name, __b._M_name);
}

// An encoding equals an identifier when mib() yields that identifier.
friend constexpr bool
operator==(const text_encoding& __encoding, id __i) noexcept
{ return __i == __encoding.mib(); }
// literal(), environment() and environment_is() are only usable when char
// has 8 bits; otherwise all three are declared as deleted.
#if __CHAR_BIT__ == 8
// Compile-time query: builds a text_encoding from the charset name macro
// provided by the compiler (__GNUC_EXECUTION_CHARSET_NAME, else
// __clang_literal_encoding__).  When neither macro is defined the result
// is a default-constructed text_encoding.
static consteval text_encoding
literal() noexcept
{
#ifdef __GNUC_EXECUTION_CHARSET_NAME
return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
#elif defined __clang_literal_encoding__
return text_encoding(__clang_literal_encoding__);
#else
return text_encoding();
#endif
}
// Declared here only; the definition is not part of this header.
static text_encoding
environment();
// Builds a text_encoding for _Id and asks it, via _M_is_environment(),
// whether it is the environment encoding.
template<id _Id>
static bool
environment_is()
{ return text_encoding(_Id)._M_is_environment(); }
#else
static text_encoding literal() = delete;
static text_encoding environment() = delete;
template<id> static bool environment_is() = delete;
#endif
private:
// Pointer into _S_reps identifying this encoding.  By default it refers
// to the second entry, the one for id::unknown.
const _Rep* _M_rep = _S_reps + 1; // id::unknown
// Null-terminated copy of the encoding name; all zeros by default.
char _M_name[max_name_length + 1] = {0};
// Used by environment_is(); declared here only, with the definition not
// part of this header.
bool
_M_is_environment() const;
// The static table of {ID, name} entries.  Layout: two nameless entries
// for other (1) and unknown (2), then the rows pulled in from
// <bits/text_encoding-data.h>, then a sentinel entry with ID 9999 and a
// null name.
static inline constexpr _Rep _S_reps[] = {
{ 1, "" }, { 2, "" },
// The macro is defined before including the data header; presumably it
// requests the table rows.  The check after the include expects the data
// header to have undefined the macro again.
#define _GLIBCXX_GET_ENCODING_DATA
#include <bits/text_encoding-data.h>
#ifdef _GLIBCXX_GET_ENCODING_DATA
# error "Invalid text_encoding data"
#endif
{ 9999, nullptr }, // sentinel
};
// Name comparison used for lookups and for comparing two id::other
// encodings; forwards to __unicode::__charset_alias_match.
static constexpr bool
_S_comp(string_view __a, string_view __b)
{ return __unicode::__charset_alias_match(__a, __b); }
// Look up an encoding by name.
//
// Returns a pointer to the first table entry that has the same ID as the
// entry matched by __name (names are compared with _S_comp), or a pointer
// to the id::other entry when no entry matches.
static constexpr const _Rep*
_S_find_name(string_view __name) noexcept
{
#ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
  // Fast path for the common UTF-8 case: an exact "UTF-8" avoids the
  // linear _S_comp scan through all strings in the table.
  if (__name == "UTF-8")
    return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
#endif

  // Skip the two leading nameless entries (other and unknown) and stop
  // before the trailing sentinel, which can never match anything.
  const _Rep* const __stop = std::end(_S_reps) - 1;
  for (const _Rep* __cur = _S_reps + 2; __cur != __stop; ++__cur)
    {
      if (!_S_comp(__cur->_M_name, __name))
        continue;

      // The match may be an alias: rewind to the first entry with this ID.
      const _Rep::id __mib = __cur->_M_id;
      for (; __cur[-1]._M_id == __mib; --__cur)
        { }
      return __cur;
    }
  return _S_reps; // id::other
}
// Look up an encoding by identifier.
//
// Returns a pointer to the first table entry whose ID equals __id.
// Preconditions: __id has the value of one of the enumerators of id.
// If that is violated the function asserts; when the assertion does not
// abort, it returns the id::unknown entry.
static constexpr const _Rep*
_S_find_id(id __id) noexcept
{
  const auto __i = static_cast<_Rep::id>(__id);

  // The search range excludes the trailing sentinel entry.  lower_bound
  // returns __last (the sentinel itself) when __i is greater than every
  // real ID, so the sentinel must never be accepted as a match: its name
  // is null and callers build a string_view from the returned name.
  const auto __last = std::end(_S_reps) - 1;
  const auto __r = std::lower_bound(_S_reps, __last, __i);
  const bool __found = __r != __last && __r->_M_id == __i;
  if (__found) [[likely]]
    return __r;
  else
    {
      // Preconditions: i has the value of one of the enumerators of id.
      __glibcxx_assert(__found);
      return _S_reps + 1; // id::unknown
    }
}
};
template<>
struct hash<text_encoding>
{
size_t
operator()(const text_encoding& __enc) const noexcept
{ return std::hash<text_encoding::id>()(__enc.mib()); }
};
// Iterator over the names of a single encoding in text_encoding::_S_reps.
//
// The iterator remembers the ID it was created for (_M_id) together with
// its current position in the table (_M_rep).  It is dereferenceable while
// the entry at its position still carries that ID; the position just past
// the last such entry is the past-the-end position.  A value-initialized
// iterator has a null position and is never dereferenceable.
//
// Misuse is checked with __glibcxx_assert.  When the assertion does not
// abort, the operation produces a fixed fallback instead: "" for an
// invalid dereference, and a value-initialized iterator for an invalid
// increment, decrement or jump.
class text_encoding::aliases_view::_Iterator
{
public:
  using value_type = const char*;
  using reference = const char*;
  using difference_type = int;

  constexpr _Iterator() = default;

  // Return the name at the current position, or "" when the iterator is
  // not dereferenceable.
  constexpr value_type
  operator*() const
  {
    if (_M_dereferenceable()) [[likely]]
      return _M_rep->_M_name;
    else
      {
        __glibcxx_assert(_M_dereferenceable());
        return "";
      }
  }

  // Advance by one.  Only a dereferenceable iterator can be advanced;
  // otherwise the iterator is reset to the value-initialized state.
  constexpr _Iterator&
  operator++()
  {
    if (_M_dereferenceable()) [[likely]]
      ++_M_rep;
    else
      {
        __glibcxx_assert(_M_dereferenceable());
        *this = _Iterator{};
      }
    return *this;
  }

  // Step back by one.  This is valid only when the preceding table entry
  // belongs to the same encoding; otherwise the iterator is reset.
  constexpr _Iterator&
  operator--()
  {
    const bool __decrementable
      = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
    if (__decrementable) [[likely]]
      --_M_rep;
    else
      {
        __glibcxx_assert(__decrementable);
        *this = _Iterator{};
      }
    return *this;
  }

  constexpr _Iterator
  operator++(int)
  {
    auto __it = *this;
    ++*this;
    return __it;
  }

  constexpr _Iterator
  operator--(int)
  {
    auto __it = *this;
    --*this;
    return __it;
  }

  // Name at offset __n from the current position (see operator* for the
  // result when that position is not dereferenceable).
  constexpr value_type
  operator[](difference_type __n) const
  { return *(*this + __n); }

  // Jump by __n positions.  A jump that would leave the range of entries
  // for this encoding (including its past-the-end position) resets the
  // iterator to the value-initialized state, which the assertion at the
  // end then reports.
  constexpr _Iterator&
  operator+=(difference_type __n)
  {
    if (_M_rep != nullptr)
      {
        if (__n > 0)
          {
            // The target must lie inside the table, and the entry just
            // before the target must still belong to this encoding, so
            // that the target is at most the past-the-end position.
            if (__n < (std::end(_S_reps) - _M_rep)
                  && _M_rep[__n - 1]._M_id == _M_id) [[likely]]
              _M_rep += __n;
            else
              *this = _Iterator{};
          }
        else if (__n < 0)
          {
            // The target must lie inside the table and must itself be an
            // entry of this encoding.
            if (__n > (_S_reps - _M_rep)
                  && _M_rep[__n]._M_id == _M_id) [[likely]]
              _M_rep += __n;
            else
              *this = _Iterator{};
          }
      }
    // A non-zero jump is invalid on a value-initialized iterator and after
    // an out-of-range reset above.
    if (__n != 0)
      __glibcxx_assert(_M_rep != nullptr);
    return *this;
  }

  constexpr _Iterator&
  operator-=(difference_type __n)
  {
    using _Traits = __gnu_cxx::__int_traits<difference_type>;
    // Negating the minimum value would overflow, so substitute the
    // largest positive jump and let operator+= range-check it.
    if (__n == _Traits::__min) [[unlikely]]
      return operator+=(_Traits::__max);
    return operator+=(-__n);
  }

  // Distance between two iterators of the same encoding.  Iterators with
  // different IDs are not comparable: that asserts, and otherwise yields
  // the maximum difference_type value.
  constexpr difference_type
  operator-(const _Iterator& __i) const
  {
    if (_M_id == __i._M_id)
      return _M_rep - __i._M_rep;
    __glibcxx_assert(_M_id == __i._M_id);
    return __gnu_cxx::__int_traits<difference_type>::__max;
  }

  // Equal when both the position and the remembered ID are equal.
  constexpr bool
  operator==(const _Iterator&) const = default;

  // Any non-dereferenceable iterator compares equal to the sentinel.
  constexpr bool
  operator==(_Sentinel) const noexcept
  { return !_M_dereferenceable(); }

  // Order by table position; both iterators are expected to have the
  // same ID, which is asserted.
  constexpr strong_ordering
  operator<=>(const _Iterator& __i) const
  {
    __glibcxx_assert(_M_id == __i._M_id);
    return _M_rep <=> __i._M_rep;
  }

  friend constexpr _Iterator
  operator+(_Iterator __i, difference_type __n)
  {
    __i += __n;
    return __i;
  }

  friend constexpr _Iterator
  operator+(difference_type __n, _Iterator __i)
  {
    __i += __n;
    return __i;
  }

  friend constexpr _Iterator
  operator-(_Iterator __i, difference_type __n)
  {
    __i -= __n;
    return __i;
  }

private:
  friend struct text_encoding;

  // Start iterating at table entry __r, remembering that entry's ID.
  // A null __r gives the same state as value-initialization.
  constexpr explicit
  _Iterator(const _Rep* __r) noexcept
  : _M_rep(__r), _M_id(__r ? __r->_M_id : 0)
  { }

  // True while the current position holds an entry of this encoding.
  constexpr bool
  _M_dereferenceable() const noexcept
  { return _M_rep != nullptr && _M_rep->_M_id == _M_id; }

  const _Rep* _M_rep = nullptr; // Current position in _S_reps, or null.
  _Rep::id _M_id = 0;           // ID of the encoding being iterated.
};
constexpr auto
text_encoding::aliases_view::begin() const noexcept
-> _Iterator
{ return _Iterator(_M_begin); }
namespace ranges
{
// Opt-in to borrowed_range concept
// Iterators obtained from an aliases_view point into text_encoding's
// static table rather than into the view object, so they remain usable
// after the view they came from has been destroyed.
template<>
inline constexpr bool
enable_borrowed_range<std::text_encoding::aliases_view> = true;
}
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif // __cpp_lib_text_encoding
#endif // _GLIBCXX_TEXT_ENCODING