unicode-muncher.pl: Updated to version 2.1 from GNU classpath.
2004-07-09 Michael Koch <konqueror@gmx.de> * scripts/unicode-muncher.pl: Updated to version 2.1 from GNU classpath. Added some clarifications on where to find the needed files from www.unicode.org. * gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html, gnu/gcj/convert/UnicodeData-3.0.0.txt: Removed, these can directly be downloaded from www.unicode.org if needed. * gnu/java/lang/CharData.java: Regenerated. * include/java-chartables.h: Regenerated. * Makefile.am (ordinary_java_source_files): Removed gnu/java/lang/CharData.java. * Makefile.in: Regenerated. From-SVN: r84399
This commit is contained in:
parent
dae1dd2e3c
commit
1f33f6b4c7
8 changed files with 1753 additions and 12540 deletions
|
@ -1,3 +1,18 @@
|
||||||
|
2004-07-09 Michael Koch <konqueror@gmx.de>
|
||||||
|
|
||||||
|
* scripts/unicode-muncher.pl: Updated to version 2.1
|
||||||
|
from GNU classpath. Added some clarifications on where to find the
|
||||||
|
needed files from www.unicode.org.
|
||||||
|
* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html,
|
||||||
|
gnu/gcj/convert/UnicodeData-3.0.0.txt:
|
||||||
|
Removed, these can directly be downloaded from www.unicode.org if
|
||||||
|
needed.
|
||||||
|
* gnu/java/lang/CharData.java: Regenerated.
|
||||||
|
* include/java-chartables.h: Regenerated.
|
||||||
|
* Makefile.am (ordinary_java_source_files):
|
||||||
|
Removed gnu/java/lang/CharData.java.
|
||||||
|
* Makefile.in: Regenerated.
|
||||||
|
|
||||||
2004-07-09 Michael Koch <konqueror@gmx.de>
|
2004-07-09 Michael Koch <konqueror@gmx.de>
|
||||||
|
|
||||||
* java/security/AccessControlContext.java,
|
* java/security/AccessControlContext.java,
|
||||||
|
|
|
@ -2289,7 +2289,6 @@ gnu/java/io/NullOutputStream.java \
|
||||||
gnu/java/io/ObjectIdentityWrapper.java \
|
gnu/java/io/ObjectIdentityWrapper.java \
|
||||||
gnu/java/lang/ArrayHelper.java \
|
gnu/java/lang/ArrayHelper.java \
|
||||||
gnu/java/lang/ClassHelper.java \
|
gnu/java/lang/ClassHelper.java \
|
||||||
gnu/java/lang/CharData.java \
|
|
||||||
gnu/java/lang/MainThread.java \
|
gnu/java/lang/MainThread.java \
|
||||||
gnu/java/lang/reflect/TypeSignature.java \
|
gnu/java/lang/reflect/TypeSignature.java \
|
||||||
gnu/java/locale/Calendar.java \
|
gnu/java/locale/Calendar.java \
|
||||||
|
|
|
@ -1961,7 +1961,6 @@ gnu/java/io/NullOutputStream.java \
|
||||||
gnu/java/io/ObjectIdentityWrapper.java \
|
gnu/java/io/ObjectIdentityWrapper.java \
|
||||||
gnu/java/lang/ArrayHelper.java \
|
gnu/java/lang/ArrayHelper.java \
|
||||||
gnu/java/lang/ClassHelper.java \
|
gnu/java/lang/ClassHelper.java \
|
||||||
gnu/java/lang/CharData.java \
|
|
||||||
gnu/java/lang/MainThread.java \
|
gnu/java/lang/MainThread.java \
|
||||||
gnu/java/lang/reflect/TypeSignature.java \
|
gnu/java/lang/reflect/TypeSignature.java \
|
||||||
gnu/java/locale/Calendar.java \
|
gnu/java/locale/Calendar.java \
|
||||||
|
@ -3237,9 +3236,8 @@ DEP_FILES = .deps/$(srcdir)/$(CONVERT_DIR)/gen-from-JIS.P \
|
||||||
.deps/gnu/java/io/ClassLoaderObjectInputStream.P \
|
.deps/gnu/java/io/ClassLoaderObjectInputStream.P \
|
||||||
.deps/gnu/java/io/NullOutputStream.P \
|
.deps/gnu/java/io/NullOutputStream.P \
|
||||||
.deps/gnu/java/io/ObjectIdentityWrapper.P \
|
.deps/gnu/java/io/ObjectIdentityWrapper.P \
|
||||||
.deps/gnu/java/lang/ArrayHelper.P .deps/gnu/java/lang/CharData.P \
|
.deps/gnu/java/lang/ArrayHelper.P .deps/gnu/java/lang/ClassHelper.P \
|
||||||
.deps/gnu/java/lang/ClassHelper.P .deps/gnu/java/lang/MainThread.P \
|
.deps/gnu/java/lang/MainThread.P .deps/gnu/java/lang/natMainThread.P \
|
||||||
.deps/gnu/java/lang/natMainThread.P \
|
|
||||||
.deps/gnu/java/lang/reflect/TypeSignature.P \
|
.deps/gnu/java/lang/reflect/TypeSignature.P \
|
||||||
.deps/gnu/java/locale/Calendar.P .deps/gnu/java/locale/Calendar_de.P \
|
.deps/gnu/java/locale/Calendar.P .deps/gnu/java/locale/Calendar_de.P \
|
||||||
.deps/gnu/java/locale/Calendar_en.P .deps/gnu/java/locale/Calendar_nl.P \
|
.deps/gnu/java/locale/Calendar_en.P .deps/gnu/java/locale/Calendar_nl.P \
|
||||||
|
|
|
@ -1,345 +0,0 @@
|
||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
|
|
||||||
|
|
||||||
"http://www.w3.org/TR/REC-html40/loose.dtd">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<head>
|
|
||||||
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
|
||||||
|
|
||||||
<meta http-equiv="Content-Language" content="en-us">
|
|
||||||
|
|
||||||
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
|
|
||||||
|
|
||||||
<meta name="ProgId" content="FrontPage.Editor.Document">
|
|
||||||
|
|
||||||
<link rel="stylesheet" href="http://www.unicode.org/unicode.css" type="text/css">
|
|
||||||
|
|
||||||
<title>Unicode Character Database</title>
|
|
||||||
|
|
||||||
</head>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h1>UNICODE CHARACTER DATABASE<br>
|
|
||||||
Version 3.0.0</h1>
|
|
||||||
|
|
||||||
<table border="1" cellspacing="2" cellpadding="0" height="87" width="100%">
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">Revision</td>
|
|
||||||
|
|
||||||
<td valign="TOP">3.0.0</td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">Authors</td>
|
|
||||||
|
|
||||||
<td valign="TOP">Mark Davis and Ken Whistler</td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">Date</td>
|
|
||||||
|
|
||||||
<td valign="TOP">1999-09-11</td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">This Version</td>
|
|
||||||
|
|
||||||
<td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">Previous Version</td>
|
|
||||||
|
|
||||||
<td valign="TOP">n/a</td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
|
|
||||||
<td valign="TOP" width="144">Latest Version</td>
|
|
||||||
|
|
||||||
<td valign="TOP"><a href="ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">ftp://ftp.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
|
|
||||||
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p align="center">Copyright © 1995-1999 Unicode, Inc. All Rights reserved.</p>
|
|
||||||
|
|
||||||
<h2>Disclaimer</h2>
|
|
||||||
|
|
||||||
<p>The Unicode Character Database is provided as is by Unicode, Inc. No claims
|
|
||||||
|
|
||||||
are made as to fitness for any particular purpose. No warranties of any kind are
|
|
||||||
|
|
||||||
expressed or implied. The recipient agrees to determine applicability of
|
|
||||||
|
|
||||||
information provided. If this file has been purchased on magnetic or optical
|
|
||||||
|
|
||||||
media from Unicode, Inc., the sole remedy for any claim will be exchange of
|
|
||||||
|
|
||||||
defective media within 90 days of receipt.</p>
|
|
||||||
|
|
||||||
<p>This disclaimer is applicable for all other data files accompanying the
|
|
||||||
|
|
||||||
Unicode Character Database, some of which have been compiled by the Unicode
|
|
||||||
|
|
||||||
Consortium, and some of which have been supplied by other sources.</p>
|
|
||||||
|
|
||||||
<h2>Limitations on Rights to Redistribute This Data</h2>
|
|
||||||
|
|
||||||
<p>Recipient is granted the right to make copies in any form for internal
|
|
||||||
|
|
||||||
distribution and to freely use the information supplied in the creation of
|
|
||||||
|
|
||||||
products supporting the Unicode<sup>TM</sup> Standard. The files in the Unicode
|
|
||||||
|
|
||||||
Character Database can be redistributed to third parties or other organizations
|
|
||||||
|
|
||||||
(whether for profit or not) as long as this notice and the disclaimer notice are
|
|
||||||
|
|
||||||
retained. Information can be extracted from these files and used in
|
|
||||||
|
|
||||||
documentation or programs, as long as there is an accompanying notice indicating
|
|
||||||
|
|
||||||
the source.</p>
|
|
||||||
|
|
||||||
<h2>Introduction</h2>
|
|
||||||
|
|
||||||
<p>The Unicode Character Database is a set of files that define the Unicode
|
|
||||||
|
|
||||||
character properties and internal mappings. For more information about character
|
|
||||||
|
|
||||||
properties and mappings, see <i><a href="http://www.unicode.org/unicode/uni2book/u2.html">The
|
|
||||||
|
|
||||||
Unicode Standard</a></i>.</p>
|
|
||||||
|
|
||||||
<p>The Unicode Character Database has been updated to reflect Version 3.0 of the
|
|
||||||
|
|
||||||
Unicode Standard, with many characters added to those published in Version 2.0.
|
|
||||||
|
|
||||||
A number of corrections have also been made to case mappings or other errors in
|
|
||||||
|
|
||||||
the database noted since the publication of Version 2.0. Normative bidirectional
|
|
||||||
|
|
||||||
properties have also been modified to reflect decisions of the Unicode Technical
|
|
||||||
|
|
||||||
Committee.</p>
|
|
||||||
|
|
||||||
<p>For more information on versions of the Unicode Standard and how to reference
|
|
||||||
|
|
||||||
them, see <a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions/</a>.</p>
|
|
||||||
|
|
||||||
<h2>Conformance</h2>
|
|
||||||
|
|
||||||
<p>Character properties may be either normative or informative. <i>Normative</i>
|
|
||||||
|
|
||||||
means that implementations that claim conformance to the Unicode Standard (at a
|
|
||||||
|
|
||||||
particular version) and which make use of a particular property or field must
|
|
||||||
|
|
||||||
follow the specifications of the standard for that property or field in order to
|
|
||||||
|
|
||||||
be conformant. The term <i>normative</i> when applied to a property or field of
|
|
||||||
|
|
||||||
the Unicode Character Database, does <i>not</i> mean that the value of that
|
|
||||||
|
|
||||||
field will never change. Corrections and extensions to the standard in the
|
|
||||||
|
|
||||||
future may require minor changes to normative values, even though the Unicode
|
|
||||||
|
|
||||||
Technical Committee strives to minimize such changes. An<i> informative </i>property
|
|
||||||
|
|
||||||
or field is strongly recommended, but a conformant implementation is free to use
|
|
||||||
|
|
||||||
or change such values as it may require while still being conformant to the
|
|
||||||
|
|
||||||
standard. Particular implementations may choose to override the properties and
|
|
||||||
|
|
||||||
mappings that are not normative. In that case, it is up to the implementer to
|
|
||||||
|
|
||||||
establish a protocol to convey that information.</p>
|
|
||||||
|
|
||||||
<h2>Files</h2>
|
|
||||||
|
|
||||||
<p>The following summarizes the files in the Unicode Character Database. For
|
|
||||||
|
|
||||||
more information about these files, see the referenced technical report or
|
|
||||||
|
|
||||||
section of Unicode Standard, Version 3.0.</p>
|
|
||||||
|
|
||||||
<p><b>UnicodeData.txt (Chapter 4)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>The main file in the Unicode Character Database.</li>
|
|
||||||
|
|
||||||
<li>For detailed information on the format, see <a href="UnicodeData.html">UnicodeData.html</a>.
|
|
||||||
|
|
||||||
This file also characterizes which properties are normative and which are
|
|
||||||
|
|
||||||
informative.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>PropList.txt (Chapter 4)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Additional informative properties list: <i>Alphabetic, Ideographic,</i>
|
|
||||||
|
|
||||||
and <i>Mathematical</i>, among others.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>SpecialCasing.txt (Chapter 4)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>List of informative special casing properties, including one-to-many
|
|
||||||
|
|
||||||
mappings such as SHARP S => "SS", and locale-specific mappings,
|
|
||||||
|
|
||||||
such as for Turkish <i>dotless i</i>.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>Blocks.txt (Chapter 14)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>List of normative block names.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>Jamo.txt (Chapter 4)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>List of normative Jamo short names, used in deriving HANGUL SYLLABLE names
|
|
||||||
|
|
||||||
algorithmically.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>ArabicShaping.txt (Section 8.2)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Basic Arabic and Syriac character shaping properties, such as initial,
|
|
||||||
|
|
||||||
medial and final shapes. These properties are normative for minimal shaping
|
|
||||||
|
|
||||||
of Arabic and Syriac. </li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>NamesList.txt (Chapter 14)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>This file duplicates some of the material in the UnicodeData file, and
|
|
||||||
|
|
||||||
adds informative annotations used in the character charts, as printed in the
|
|
||||||
|
|
||||||
Unicode Standard. </li>
|
|
||||||
|
|
||||||
<li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
|
|
||||||
|
|
||||||
the appropriate version of the book. Changes in the Unicode Character
|
|
||||||
|
|
||||||
Database since then may not be reflected in these files, since they are
|
|
||||||
|
|
||||||
primarily of archival interest.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>Index.txt (Chapter 14)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Informative index to Unicode characters, as printed in the Unicode
|
|
||||||
|
|
||||||
Standard</li>
|
|
||||||
|
|
||||||
<li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
|
|
||||||
|
|
||||||
the appropriate version of the book. Changes in the Unicode Character
|
|
||||||
|
|
||||||
Database since then may not be reflected in these files, since they are
|
|
||||||
|
|
||||||
primarily of archival interest.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>CompositionExclusions.txt (<a href="http://www.unicode.org/unicode/reports/tr15/">UTR#15
|
|
||||||
|
|
||||||
Unicode Normalization Forms</a>)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Normative properties for normalization.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>LineBreak.txt (<a href="http://www.unicode.org/unicode/reports/tr14/">UTR
|
|
||||||
|
|
||||||
#14: Line Breaking Properties</a>)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Normative and informative properties for line breaking. To see which
|
|
||||||
|
|
||||||
properties are informative and which are normative, consult UTR#14.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>EastAsianWidth.txt (<a href="http://www.unicode.org/unicode/reports/tr11/">UTR
|
|
||||||
|
|
||||||
#11: East Asian Character Width</a>)</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Informative properties for determining the choice of wide vs. narrow
|
|
||||||
|
|
||||||
glyphs in East Asian contexts.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b>diffXvY.txt</b>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Mechanically-generated informative files containing accumulated
|
|
||||||
|
|
||||||
differences between successive versions of UnicodeData.txt</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</body>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</html>
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/perl -w
|
#!/usr/bin/perl -w
|
||||||
# unicode-muncher.pl -- generate Unicode database for java.lang.Character
|
# unicode-muncher.pl -- generate Unicode database for java.lang.Character
|
||||||
# Copyright (C) 1998, 2002 Free Software Foundation, Inc.
|
# Copyright (C) 1998, 2002, 2004 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is part of GNU Classpath.
|
# This file is part of GNU Classpath.
|
||||||
#
|
#
|
||||||
|
@ -36,18 +36,22 @@
|
||||||
# obligated to do so. If you do not wish to do so, delete this
|
# obligated to do so. If you do not wish to do so, delete this
|
||||||
# exception statement from your version.
|
# exception statement from your version.
|
||||||
|
|
||||||
# Code for reading UnicodeData.txt and generating the code for
|
# Code for reading UnicodeData-3.0.0.txt and SpecialCasing-2.txt to generate
|
||||||
# gnu.java.lang.CharData. For now, the relevant Unicode definition files
|
# the code for gnu.java.lang.CharData. The relevant files can be found here:
|
||||||
# are found in libjava/gnu/gcj/convert/.
|
#
|
||||||
|
# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
|
||||||
|
# http://www.unicode.org/Public/3.0-Update/SpecialCasing-2.txt
|
||||||
#
|
#
|
||||||
# Inspired by code from Jochen Hoenicke.
|
# Inspired by code from Jochen Hoenicke.
|
||||||
# author Eric Blake <ebb9@email.byu.edu>
|
# author Eric Blake <ebb9@email.byu.edu>
|
||||||
#
|
#
|
||||||
# Usage: ./unicode-muncher <UnicodeData.txt> <CharData.java>
|
# Usage: ./unicode-muncher <UnicodeData.txt> <SpecialCasing> <CharData.java>
|
||||||
# where <UnicodeData.txt> is obtained from www.unicode.org (named
|
# where <UnicodeData.txt> is obtained from www.unicode.org (named
|
||||||
# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
|
# UnicodeData-3.0.0.txt for Unicode version 3.0.0), <SpecialCasing>
|
||||||
# is the final location for the Java interface gnu.java.lang.CharData.
|
# is obtained from www.unicode.org too (named SpecialCasing-2.txt for Unicode
|
||||||
# As of JDK 1.4, use Unicode version 3.0.0 for best results.
|
# version 3.0.0), and <CharData.java> is the final location for the Java
|
||||||
|
# interface gnu.java.lang.CharData. As of JDK 1.4, use Unicode version 3.0.0
|
||||||
|
# for best results.
|
||||||
|
|
||||||
##
|
##
|
||||||
## Convert a 16-bit integer to a Java source code String literal character
|
## Convert a 16-bit integer to a Java source code String literal character
|
||||||
|
@ -75,20 +79,42 @@ my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
|
||||||
my $NOBREAK_FLAG = 32;
|
my $NOBREAK_FLAG = 32;
|
||||||
my $MIRRORED_FLAG = 64;
|
my $MIRRORED_FLAG = 64;
|
||||||
|
|
||||||
|
my %special = ();
|
||||||
my @info = ();
|
my @info = ();
|
||||||
my $titlecase = "";
|
my $titlecase = "";
|
||||||
my $count = 0;
|
my $count = 0;
|
||||||
my $range = 0;
|
my $range = 0;
|
||||||
|
|
||||||
die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
|
die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
|
||||||
open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
|
unless @ARGV == 3;
|
||||||
|
$| = 1;
|
||||||
|
print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
|
||||||
|
print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
|
||||||
|
|
||||||
|
# Stage 0: Parse the special casing file
|
||||||
|
print "Parsing special casing file\n";
|
||||||
|
open (SPECIAL, "< $ARGV[1]") || die "Can't open special casing file: $!\n";
|
||||||
|
while (<SPECIAL>) {
|
||||||
|
next if /^\#/;
|
||||||
|
my ($ch, undef, undef, $upper) = split / *; */;
|
||||||
|
|
||||||
|
# This grabs only the special casing for multi-char uppercase. Note that
|
||||||
|
# there are no multi-char lowercase, and that Sun ignores multi-char
|
||||||
|
# titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
|
||||||
|
# which must be hardcoded in java.lang.String:
|
||||||
|
# \u03a3 (Sun ignores this special case)
|
||||||
|
# \u0049 - lowercases to \u0131, but only in Turkish locale
|
||||||
|
# \u0069 - uppercases to \u0130, but only in Turkish locale
|
||||||
|
next unless defined $upper and $upper =~ / /;
|
||||||
|
$special{hex $ch} = [map {hex} split ' ', $upper];
|
||||||
|
}
|
||||||
|
|
||||||
|
close SPECIAL;
|
||||||
|
|
||||||
# Stage 1: Parse the attribute file
|
# Stage 1: Parse the attribute file
|
||||||
$| = 1;
|
|
||||||
print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
|
|
||||||
print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
|
|
||||||
print "Parsing attributes file";
|
print "Parsing attributes file";
|
||||||
while(<UNICODE>) {
|
open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
|
||||||
|
while (<UNICODE>) {
|
||||||
print "." unless $count++ % 1000;
|
print "." unless $count++ % 1000;
|
||||||
chomp;
|
chomp;
|
||||||
s/\r//g;
|
s/\r//g;
|
||||||
|
@ -142,6 +168,8 @@ while(<UNICODE>) {
|
||||||
last;
|
last;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
$direction <<= 2;
|
||||||
|
$direction += $#{$special{$ch}} if defined $special{$ch};
|
||||||
|
|
||||||
if ($range) {
|
if ($range) {
|
||||||
die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
|
die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
|
||||||
|
@ -167,9 +195,7 @@ my @charinfo = ();
|
||||||
|
|
||||||
for my $ch (0 .. 0xffff) {
|
for my $ch (0 .. 0xffff) {
|
||||||
print "." unless $count++ % 0x1000;
|
print "." unless $count++ % 0x1000;
|
||||||
if (! defined $info[$ch]) {
|
$info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];
|
||||||
$info[$ch] = pack("n5", 0, -1, 0, 0, -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
|
my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
|
||||||
if (! exists $charhash{$info[$ch]}) {
|
if (! exists $charhash{$info[$ch]}) {
|
||||||
|
@ -209,7 +235,7 @@ for my $i (3 .. 8) {
|
||||||
for ($j = $blksize - 1; $j > 0; $j--) {
|
for ($j = $blksize - 1; $j > 0; $j--) {
|
||||||
my %tails = ();
|
my %tails = ();
|
||||||
for $k (0 .. $#blkarray) {
|
for $k (0 .. $#blkarray) {
|
||||||
next if ! defined $blkarray[$k];
|
next unless defined $blkarray[$k];
|
||||||
my $len = length $blkarray[$k];
|
my $len = length $blkarray[$k];
|
||||||
my $tail = substr $blkarray[$k], $len - $j * 2;
|
my $tail = substr $blkarray[$k], $len - $j * 2;
|
||||||
if (exists $tails{$tail}) {
|
if (exists $tails{$tail}) {
|
||||||
|
@ -222,12 +248,12 @@ for my $i (3 .. 8) {
|
||||||
# tails are calculated, now calculate the heads and merge.
|
# tails are calculated, now calculate the heads and merge.
|
||||||
BLOCK:
|
BLOCK:
|
||||||
for $k (0 .. $#blkarray) {
|
for $k (0 .. $#blkarray) {
|
||||||
next if ! defined $blkarray[$k];
|
next unless defined $blkarray[$k];
|
||||||
my $tomerge = $k;
|
my $tomerge = $k;
|
||||||
while (1) {
|
while (1) {
|
||||||
my $head = substr($blkarray[$tomerge], 0, $j * 2);
|
my $head = substr($blkarray[$tomerge], 0, $j * 2);
|
||||||
my $entry = $tails{$head};
|
my $entry = $tails{$head};
|
||||||
next BLOCK if ! defined $entry;
|
next BLOCK unless defined $entry;
|
||||||
|
|
||||||
my $other = shift @{$entry};
|
my $other = shift @{$entry};
|
||||||
if ($other == $tomerge) {
|
if ($other == $tomerge) {
|
||||||
|
@ -297,10 +323,10 @@ die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
|
||||||
die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
|
die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
|
||||||
if length($bestblkstr) > 0xffff / 3;
|
if length($bestblkstr) > 0xffff / 3;
|
||||||
{
|
{
|
||||||
print "Generating $ARGV[1] with shift of $bestshift";
|
print "Generating $ARGV[2] with shift of $bestshift";
|
||||||
my ($i, $j);
|
my ($i, $j);
|
||||||
|
|
||||||
open OUTPUT, "> $ARGV[1]" or die "Failed creating output file: $!\n";
|
open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
|
||||||
print OUTPUT <<EOF;
|
print OUTPUT <<EOF;
|
||||||
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
|
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
|
||||||
Copyright (C) 2002 Free Software Foundation, Inc.
|
Copyright (C) 2002 Free Software Foundation, Inc.
|
||||||
|
@ -345,8 +371,9 @@ package gnu.java.lang;
|
||||||
/**
|
/**
|
||||||
* This contains the info about the unicode characters, that
|
* This contains the info about the unicode characters, that
|
||||||
* java.lang.Character needs. It is generated automatically from
|
* java.lang.Character needs. It is generated automatically from
|
||||||
* <code>$ARGV[0]</code>, by some
|
* <code>$ARGV[0]</code> and
|
||||||
* perl scripts. This Unicode definition file can be found on the
|
* <code>$ARGV[1]</code>, by some
|
||||||
|
* perl scripts. These Unicode definition files can be found on the
|
||||||
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
|
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
|
||||||
* JDK 1.4 uses Unicode version 3.0.0.
|
* JDK 1.4 uses Unicode version 3.0.0.
|
||||||
*
|
*
|
||||||
|
@ -358,13 +385,18 @@ package gnu.java.lang;
|
||||||
* into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
|
* into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
|
||||||
* <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
|
* <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
|
||||||
* attribute tables are much smaller than 0xffff entries; as many characters
|
* attribute tables are much smaller than 0xffff entries; as many characters
|
||||||
* in Unicode share common attributes. Finally, there is a listing for
|
* in Unicode share common attributes. The DIRECTION table also contains
|
||||||
* <code>TITLE</code> exceptions (most characters just have the same
|
* a field for detecting characters with multi-character uppercase expansions.
|
||||||
* title case as upper case).
|
* Next, there is a listing for <code>TITLE</code> exceptions (most characters
|
||||||
|
* just have the same title case as upper case). Finally, there are two
|
||||||
|
* tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
|
||||||
|
* which lists the characters which are special cased, and
|
||||||
|
* <code>UPPER_EXPAND</code>, which lists their expansion.
|
||||||
*
|
*
|
||||||
* \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
|
* \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
|
||||||
* Eric Blake)
|
* Eric Blake)
|
||||||
* \@see Character
|
* \@see Character
|
||||||
|
* \@see String
|
||||||
*/
|
*/
|
||||||
public interface CharData
|
public interface CharData
|
||||||
{
|
{
|
||||||
|
@ -417,7 +449,7 @@ EOF
|
||||||
print OUTPUT $i ? "\n + \"" : " = \"";
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
for $j (0 .. 10) {
|
for $j (0 .. 10) {
|
||||||
last if $len <= $i * 11 + $j;
|
last if $len <= $i * 11 + $j;
|
||||||
my $val = unpack "n", substr($bestblkstr, 2 * ($i*11 + $j), 2);
|
my $val = unpack "n", substr($bestblkstr, 2 * ($i * 11 + $j), 2);
|
||||||
print OUTPUT javaChar($val);
|
print OUTPUT javaChar($val);
|
||||||
}
|
}
|
||||||
print OUTPUT "\"";
|
print OUTPUT "\"";
|
||||||
|
@ -451,10 +483,12 @@ EOF
|
||||||
;
|
;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the attribute table for computing the uppercase representation
|
* This is the attribute table for computing the single-character uppercase
|
||||||
* of a character. The value is the signed difference between the
|
* representation of a character. The value is the signed difference
|
||||||
* character and its uppercase version. Note that this is stored as an
|
* between the character and its uppercase version. Note that this is
|
||||||
* unsigned char since this is a String literal.
|
* stored as an unsigned char since this is a String literal. When
|
||||||
|
* capitalizing a String, you must first check if a multi-character uppercase
|
||||||
|
* sequence exists before using this character.
|
||||||
*/
|
*/
|
||||||
String UPPER
|
String UPPER
|
||||||
EOF
|
EOF
|
||||||
|
@ -483,11 +517,11 @@ EOF
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
$len = @charinfo;
|
$len = @charinfo;
|
||||||
for ($i = 0; $i < $len / 11; $i++) {
|
for ($i = 0; $i < $len / 13; $i++) {
|
||||||
print OUTPUT $i ? "\n + \"" : " = \"";
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
for $j (0 .. 10) {
|
for $j (0 .. 12) {
|
||||||
last if $len <= $i * 11 + $j;
|
last if $len <= $i * 13 + $j;
|
||||||
my $val = $charinfo[$i * 11 + $j][2];
|
my $val = $charinfo[$i * 13 + $j][2];
|
||||||
print OUTPUT javaChar($val);
|
print OUTPUT javaChar($val);
|
||||||
}
|
}
|
||||||
print OUTPUT "\"";
|
print OUTPUT "\"";
|
||||||
|
@ -498,19 +532,25 @@ EOF
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the attribute table for computing the directionality class
|
* This is the attribute table for computing the directionality class
|
||||||
* of a character. At present, the value is in the range 0 - 18 if the
|
* of a character, as well as a marker of characters with a multi-character
|
||||||
* character has a direction, otherwise it is -1. Note that this is
|
* capitalization. The direction is taken by performing a signed shift
|
||||||
* stored as an unsigned char since this is a String literal.
|
* right by 2 (where a result of -1 means an unknown direction, such as
|
||||||
|
* for undefined characters). The lower 2 bits form a count of the
|
||||||
|
* additional characters that will be added to a String when performing
|
||||||
|
* multi-character uppercase expansion. This count is also used, along with
|
||||||
|
* the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
|
||||||
|
* when performing the case conversion. Note that this information is stored
|
||||||
|
* as an unsigned char since this is a String literal.
|
||||||
*/
|
*/
|
||||||
String DIRECTION
|
String DIRECTION
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
$len = @charinfo;
|
$len = @charinfo;
|
||||||
for ($i = 0; $i < $len / 11; $i++) {
|
for ($i = 0; $i < $len / 17; $i++) {
|
||||||
print OUTPUT $i ? "\n + \"" : " = \"";
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
for $j (0 .. 10) {
|
for $j (0 .. 16) {
|
||||||
last if $len <= $i * 11 + $j;
|
last if $len <= $i * 17 + $j;
|
||||||
my $val = $charinfo[$i * 11 + $j][3];
|
my $val = $charinfo[$i * 17 + $j][3];
|
||||||
print OUTPUT javaChar($val);
|
print OUTPUT javaChar($val);
|
||||||
}
|
}
|
||||||
print OUTPUT "\"";
|
print OUTPUT "\"";
|
||||||
|
@ -520,10 +560,10 @@ EOF
|
||||||
;
|
;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the listing of titlecase special cases (all other character
|
* This is the listing of titlecase special cases (all other characters
|
||||||
* can use <code>UPPER</code> to determine their titlecase). The listing
|
* can use <code>UPPER</code> to determine their titlecase). The listing
|
||||||
* is a sequence of character pairs; converting the first character of the
|
* is a sorted sequence of character pairs; converting the first character
|
||||||
* pair to titlecase produces the second character.
|
* of the pair to titlecase produces the second character.
|
||||||
*/
|
*/
|
||||||
String TITLE
|
String TITLE
|
||||||
EOF
|
EOF
|
||||||
|
@ -533,7 +573,64 @@ EOF
|
||||||
print OUTPUT $i ? "\n + \"" : " = \"";
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
for $j (0 .. 10) {
|
for $j (0 .. 10) {
|
||||||
last if $len <= $i * 11 + $j;
|
last if $len <= $i * 11 + $j;
|
||||||
my $val = unpack "n", substr($titlecase, 2 * ($i*11 + $j), 2);
|
my $val = unpack "n", substr($titlecase, 2 * ($i * 11 + $j), 2);
|
||||||
|
print OUTPUT javaChar($val);
|
||||||
|
}
|
||||||
|
print OUTPUT "\"";
|
||||||
|
}
|
||||||
|
|
||||||
|
print OUTPUT <<EOF;
|
||||||
|
;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a listing of characters with multi-character uppercase sequences.
|
||||||
|
* A character appears in this list exactly when it has a non-zero entry
|
||||||
|
* in the low-order 2-bit field of DIRECTION. The listing is a sorted
|
||||||
|
* sequence of pairs (hence a binary search on the even elements is an
|
||||||
|
* efficient way to lookup a character). The first element of a pair is the
|
||||||
|
* character with the expansion, and the second is the index into
|
||||||
|
* UPPER_EXPAND where the expansion begins. Use the 2-bit field of
|
||||||
|
* DIRECTION to determine where the expansion ends.
|
||||||
|
*/
|
||||||
|
String UPPER_SPECIAL
|
||||||
|
EOF
|
||||||
|
|
||||||
|
my @list = sort {$a <=> $b} keys %special;
|
||||||
|
my $expansion = "";
|
||||||
|
my $offset = 0;
|
||||||
|
$len = @list;
|
||||||
|
for ($i = 0; $i < $len / 5; $i++) {
|
||||||
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
|
for $j (0 .. 4) {
|
||||||
|
last if $len <= $i * 5 + $j;
|
||||||
|
my $ch = $list[$i * 5 + $j];
|
||||||
|
print OUTPUT javaChar($ch);
|
||||||
|
print OUTPUT javaChar($offset);
|
||||||
|
$offset += @{$special{$ch}};
|
||||||
|
$expansion .= pack "n*", @{$special{$ch}};
|
||||||
|
}
|
||||||
|
print OUTPUT "\"";
|
||||||
|
}
|
||||||
|
|
||||||
|
print OUTPUT <<EOF;
|
||||||
|
;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the listing of special case multi-character uppercase sequences.
|
||||||
|
* Characters listed in UPPER_SPECIAL index into this table to find their
|
||||||
|
* uppercase expansion. Remember that you must also perform special-casing
|
||||||
|
* on two single-character sequences in the Turkish locale, which are not
|
||||||
|
* covered here in CharData.
|
||||||
|
*/
|
||||||
|
String UPPER_EXPAND
|
||||||
|
EOF
|
||||||
|
|
||||||
|
$len = length($expansion) / 2;
|
||||||
|
for ($i = 0; $i < $len / 11; $i++) {
|
||||||
|
print OUTPUT $i ? "\n + \"" : " = \"";
|
||||||
|
for $j (0 .. 10) {
|
||||||
|
last if $len <= $i * 11 + $j;
|
||||||
|
my $val = unpack "n", substr($expansion, 2 * ($i * 11 + $j), 2);
|
||||||
print OUTPUT javaChar($val);
|
print OUTPUT javaChar($val);
|
||||||
}
|
}
|
||||||
print OUTPUT "\"";
|
print OUTPUT "\"";
|
||||||
|
|
Loading…
Add table
Reference in a new issue