Initial revision

From-SVN: r26263
This commit is contained in:
Tom Tromey 1999-04-07 14:42:40 +00:00
parent 140fa895c6
commit ee9dd3721b
370 changed files with 173494 additions and 0 deletions

View file

@ -0,0 +1,105 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
public abstract class BytesToUnicode
{
  /** Source buffer for the conversion.
   * The bytes inbuffer[inpos] ... inbuffer[inlength-1] are available. */
  public byte[] inbuffer;

  /** Index in inbuffer of the next byte to consume. */
  public int inpos;

  /** Index just past the last valid byte in inbuffer. */
  public int inlength;

  /** Cached decoder class for the encoding named by "file.encoding". */
  static Class defaultDecodingClass;

  /** Look up and cache the decoder class for the default encoding.
   * @throws NoClassDefFoundError if no decoder class exists for it
   */
  static synchronized void getDefaultDecodingClass()
  {
    // Re-check under the lock: another thread may have finished the
    // lookup between the caller's test and our acquiring the monitor.
    if (defaultDecodingClass != null)
      return;

    String encoding = System.getProperty("file.encoding");
    String className = "gnu.gcj.convert.Input_"+encoding;
    try
      {
        defaultDecodingClass = Class.forName(className);
      }
    catch (ClassNotFoundException ex)
      {
        throw new NoClassDefFoundError("missing default encoding "
                                       + encoding + " (class "
                                       + className + " not found)");
      }
  }

  /** Name of the encoding handled by this decoder. */
  public abstract String getName();

  /** Create a decoder for the default encoding.
   * Any failure to find or instantiate it yields an Input_8859_1 instead.
   */
  public static BytesToUnicode getDefaultDecoder()
  {
    BytesToUnicode decoder;
    try
      {
        if (defaultDecodingClass == null)
          getDefaultDecodingClass();
        decoder = (BytesToUnicode) defaultDecodingClass.newInstance();
      }
    catch (Throwable ex)
      {
        decoder = new Input_8859_1();
      }
    return decoder;
  }

  /** Get a byte-stream->char-stream converter given an encoding name.
   * @param encoding suffix of the gnu.gcj.convert.Input_* class to load
   * @throws java.io.UnsupportedEncodingException if the class cannot be
   *   loaded or instantiated
   */
  public static BytesToUnicode getDecoder (String encoding)
    throws java.io.UnsupportedEncodingException
  {
    try
      {
        Class decodingClass
          = Class.forName("gnu.gcj.convert.Input_" + encoding);
        return (BytesToUnicode) decodingClass.newInstance();
      }
    catch (Throwable ex)
      {
        throw new java.io.UnsupportedEncodingException(encoding
                                                       + " (" + ex + ')');
      }
  }

  /** Make input bytes available to the conversion.
   * @param buffer source of input bytes
   * @param pos index of first available byte
   * @param length one more than index of last available byte
   */
  public final void setInput(byte[] buffer, int pos, int length)
  {
    inlength = length;
    inpos = pos;
    inbuffer = buffer;
  }

  /** Convert bytes to chars.
   * Input is taken from this.inbuffer; the available bytes run from
   * inbuffer[inpos] up to and including inbuffer[inlength-1].
   * @param outbuffer buffer receiving the converted characters
   * @param outpos index in outbuffer of the first character to store
   * @param outlength the maximum number of characters to produce
   * @return number of chars placed in outbuffer.
   * Also, this.inpos is incremented by the number of bytes consumed.
   *
   * (The bounds are deliberately asymmetric: input ends at
   * inbuffer[inlength-1], output ends at outbuffer[outpos+outlength-1].
   * inlength plays the role of the count field of a BufferedInputStream,
   * whereas outlength is like the length argument of a read request.)
   */
  public abstract int read (char[] outbuffer, int outpos, int outlength);
}

View file

@ -0,0 +1,151 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
import java.io.*;
/** Command-line tool that copies a file (or stdin to stdout) while
 * converting it from one character encoding to another. */
public class Convert
{
  /** Print a diagnostic and the usage summary to stderr, then exit with
   * status -1.  Does not return. */
  static void error (String message)
  {
    System.err.print("convert: ");
    System.err.println(message);
    System.err.println("Usage: convert [--from srcEncoding] [--to dstEncoding]");
    System.err.println(" [inputfile [outputfile]]");
    System.exit(-1);
  }

  /** Report that option ARG was given without its required value. */
  static void missing (String arg)
  {
    error("missing arg after `" + arg + "' option");
  }

  /** Program entry point.
   * Recognized options: -encoding/-from NAME (input encoding),
   * -to NAME (output encoding, default "JavaSrc"), -i FILE, -o FILE,
   * -reverse (swap the two encodings).  Up to two bare arguments name
   * the input and output file; "-" means stdin/stdout.
   */
  public static void main (String[] args)
  {
    String inName = "-";
    String outName = "-";
    String inEncodingName = null;
    String outEncodingName = "JavaSrc";
    int seenNames = 0;
    boolean reverse = false;

    for (int i = 0; i < args.length; i++)
      {
        String arg = args[i];
        if (arg.length() == 0)
          error("zero-length argument");
        if (arg.charAt(0) == '-')
          {
            // Every alternative must test `arg', the current option;
            // comparing the whole `args' array never matches.
            if (arg.equals("-encoding") || arg.equals("--encoding")
                || arg.equals("-from") || arg.equals("--from"))
              {
                if (++i == args.length) missing(arg);
                inEncodingName = args[i];
              }
            else if (arg.equals("-to") || arg.equals("--to"))
              {
                if (++i == args.length) missing(arg);
                outEncodingName = args[i];
              }
            else if (arg.equals("-i"))
              {
                if (++i == args.length) missing(arg);
                inName = args[i];
              }
            else if (arg.equals("-o"))
              {
                if (++i == args.length) missing(arg);
                outName = args[i];
              }
            else if (arg.equals("-reverse") || arg.equals("--reverse"))
              {
                reverse = true;
              }
            else if (arg.equals("-"))
              {
                switch (seenNames)
                  {
                  case 0:
                    inName = "-";
                    seenNames++;
                    break;
                  case 1:
                    outName = "-";
                    seenNames++;
                    break;
                  default:
                    error("too many `-' arguments");
                  }
              }
            else
              error("unrecognized argument `" + arg + "'");
          }
        else
          {
            switch (seenNames)
              {
              case 0:
                inName = arg;
                seenNames++;
                break;
              case 1:
                outName = arg;
                seenNames++;
                break;
              default:
                error("too many filename arguments");
              }
          }
      }

    if (reverse)
      {
        String tmp = inEncodingName;
        inEncodingName = outEncodingName;
        outEncodingName = tmp;
      }

    try
      {
        // Look both converters up before touching any file, so that an
        // unsupported encoding is reported before the output file is
        // created or truncated.  The instances themselves are not used.
        if (inEncodingName == null)
          BytesToUnicode.getDefaultDecoder();
        else
          BytesToUnicode.getDecoder(inEncodingName);
        if (outEncodingName == null)
          UnicodeToBytes.getDefaultEncoder();
        else
          UnicodeToBytes.getEncoder(outEncodingName);

        // File names come from the command line, so they must be
        // compared by value; `==' only matched the literal defaults.
        InputStream inStream = inName.equals("-") ? System.in
          : new FileInputStream(inName);
        OutputStream outStream;
        if (outName.equals("-"))
          outStream = System.out;
        else
          outStream = new FileOutputStream(outName);

        // A null encoding name means "use the default encoding"; the
        // two-argument constructors do not accept null.
        InputStreamReader in = inEncodingName == null
          ? new InputStreamReader(inStream)
          : new InputStreamReader(inStream, inEncodingName);
        OutputStreamWriter out = outEncodingName == null
          ? new OutputStreamWriter(outStream)
          : new OutputStreamWriter(outStream, outEncodingName);

        char[] buffer = new char[2048];
        for (;;)
          {
            int count = in.read(buffer);
            if (count < 0)
              break;
            out.write(buffer, 0, count);
          }
        in.close();
        out.close();
      }
    catch (java.io.IOException ex)
      {
        System.err.print("convert exception: ");
        System.err.println(ex);
        System.exit(-1);
      }
  }
}

View file

@ -0,0 +1,32 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Decoder for ISO 8859-1: every input byte maps to the Unicode
 * character with the same (unsigned) value. */
public class Input_8859_1 extends BytesToUnicode
{
  public String getName() { return "8859_1"; }

  /** Convert bytes from this.inbuffer to chars.
   * @param outbuffer buffer receiving the converted characters
   * @param outpos index in outbuffer of the first character to store
   * @param outlength the maximum number of characters to produce
   * @return number of chars stored; this.inpos advances by the same amount
   */
  public int read (char[] outbuffer, int outpos, int outlength)
  {
    int origpos = outpos;
    // Make sure fields of this are in registers.
    int inpos = this.inpos;
    byte[] inbuffer = this.inbuffer;
    int inavail = this.inlength - inpos;
    // Per the BytesToUnicode.read contract, outlength is a count of
    // characters (output ends at outbuffer[outpos+outlength-1]), not an
    // end index, so outpos must not be subtracted from it.
    int outavail = outlength;
    if (outavail > inavail)
      outavail = inavail;
    while (--outavail >= 0)
      {
        // Mask to undo the sign extension of byte values 0x80..0xFF.
        outbuffer[outpos++] = (char) (inbuffer[inpos++] & 0xFF);
      }
    this.inpos = inpos;
    return outpos - origpos;
  }
}

View file

@ -0,0 +1,19 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Decoder registered under the encoding name "EUCJIS".
 * The conversion itself is implemented natively (Input_EUCJIS::read). */
public class Input_EUCJIS extends BytesToUnicode
{
public String getName() { return "EUCJIS"; }
// Decoder state carried between read calls, maintained by the native
// read method: 0 = expecting ASCII or a lead byte, 1 = first byte of a
// JIS X 0208 pair seen, 2 = SS2 seen, 3 = SS3 seen, 4 = second byte of
// a JIS X 0212 sequence seen.
int codeset = 0;
// Saved lead byte of the two-byte code being assembled.
int first_byte;
/** Convert bytes to chars; see BytesToUnicode.read for the contract. */
public native int read (char[] outbuffer, int outpos, int outlength);
}

View file

@ -0,0 +1,107 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Decoder for UTF-8.  Sequences of up to four bytes are accepted;
 * four-byte sequences are delivered as a surrogate pair.  Decoding state
 * is kept in fields, so a sequence may be split across read calls. */
public class Input_UTF8 extends BytesToUnicode
{
  public String getName() { return "UTF8"; }

  /** Bits accumulated so far for the multi-byte sequence in progress. */
  int partial = 0;

  /** Number of continuation bytes still expected for that sequence. */
  int partial_bytes_expected = 0;

  /** Convert bytes from this.inbuffer to chars.
   * @param outbuffer buffer receiving the converted characters
   * @param outpos index in outbuffer of the first character to store
   * @param outlength the maximum number of characters to produce
   * @return number of chars stored; this.inpos advances by the number
   *   of bytes consumed
   */
  public int read (char[] outbuffer, int outpos, int outlength)
  {
    int origpos = outpos;
    for (;;)
      {
        // outlength is a count (see BytesToUnicode.read), so compare it
        // with the number of chars produced, not with the raw index.
        // Each iteration stores at most one char, so one free slot is
        // enough to proceed.
        if (outpos - origpos >= outlength)
          break;
        if (inpos >= inlength)
          break;
        int b = inbuffer[inpos++];
        if (b >= 0)
          outbuffer[outpos++] = (char) b;
        else
          {
            if ((b & 0xC0) == 0x80) // Continuation byte
              {
                partial = (partial << 6) | (b & 0x3F);
                --partial_bytes_expected;
                if (partial_bytes_expected == 1)
                  {
                    if (partial > (0xFFFF>>6))
                      {
                        // The next continuation byte will cause the result
                        // to exceed 0xFFFF, so we must use a surrogate pair.
                        // The "Unicode scalar value" (see D28 in section 3.7
                        // of the Unicode Standard 2.0) is defined as:
                        // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
                        // where (hi, lo) is the Unicode surrogate pair.
                        // After reading the first three bytes, we have:
                        // partial == (value >> 6).
                        // Substituting and simplifying, we get:
                        // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
                        // The definition lo>=0xDC00 && lo<=0xDFFF implies
                        // that (lo-0xDC00)>>6 is in the range 0..15.
                        // Hence (partial-0x400)>>4 == hi-0xD800, and we
                        // can emit the high-surrogate without waiting
                        // for the final byte:
                        outbuffer[outpos++]
                          = (char) (0xD800 + ((partial - 0x400) >> 4));
                        // Now we want to set it up so that when we read
                        // the final byte on the next iteration, we will
                        // get the low-surrogate without special handling.
                        // I.e. we want:
                        // lo == (next_partial << 6) | (next & 0x3F)
                        // where next is the next input byte and next_partial
                        // is the value of partial at the end of this
                        // iteration. This implies: next_partial == lo >> 6.
                        // Since (lo-0xDC00)>>6 == partial&0xF and
                        // 0xDC00>>6 == 0x370, we get:
                        // next_partial == (partial&0xF) + 0x370. Hence:
                        partial = (partial & 0xF) + 0x370;
                      }
                  }
                else if (partial_bytes_expected == 0)
                  {
                    outbuffer[outpos++] = (char) partial;
                    partial = 0;
                    partial_bytes_expected = 0;
                  }
              }
            else // prefix byte
              {
                // 110xxxxx starts a two-byte sequence, 1110xxxx a
                // three-byte sequence; anything else is treated as the
                // start of a four-byte sequence.
                if ((b & 0xE0) == 0xC0)
                  {
                    partial = b & 0x1F;
                    partial_bytes_expected = 1;
                  }
                else if ((b & 0xF0) == 0xE0)
                  {
                    partial = b & 0xF;
                    partial_bytes_expected = 2;
                  }
                else
                  {
                    partial = b & 7;
                    partial_bytes_expected = 3;
                  }
              }
          }
      }
    return outpos - origpos;
  }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,31 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Encoder for ISO 8859-1: chars up to 0xFF are written as the byte of
 * the same value; chars that have no ISO 8859-1 representation are
 * written as '?'. */
public class Output_8859_1 extends UnicodeToBytes
{
  public String getName() { return "8859_1"; }

  /** Convert chars to bytes, appending them to this.buf at this.count.
   * @param inbuffer source of characters to convert
   * @param inpos index of the first character to convert
   * @param inlength number of characters to convert
   * @return number of chars converted; limited by the room left in buf
   */
  public int write (char[] inbuffer, int inpos, int inlength)
  {
    int count = this.count;
    byte[] buf = this.buf;
    int avail = buf.length - count;
    if (inlength > avail)
      inlength = avail;
    for (int i = inlength; --i >= 0; )
      {
        char c = inbuffer[inpos++];
        // A plain (byte) cast would keep only the low eight bits and
        // silently turn an unmappable char into an unrelated one.
        buf[count++] = (byte) (c > 0xFF ? '?' : c);
      }
    this.count = count;
    return inlength;
  }
}

View file

@ -0,0 +1,82 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Encoder producing ASCII in which a backslash is doubled and every
 * char of value 127 or above is written as a backslash, the letter u
 * and four hexadecimal digits. */
public class Output_JavaSrc extends UnicodeToBytes
{
  public String getName() { return "JavaSrc"; }

  // Progress through the escape for pending_char: 5 means the 'u' is
  // still to be written, 4..1 count the hex digits left (or, for a
  // pending backslash, 1 means its second backslash is left); 0 = idle.
  int todo;
  int pending_char;

  /** Convert chars to bytes, appending them to buf at count.
   * An escape that does not fit in buf is resumed by the next call.
   * @return number of chars taken from inbuffer
   */
  public int write (char[] inbuffer, int inpos, int inlength)
  {
    final int first = inpos;
    int room = buf.length - count;
    while (room != 0)
      {
        // First finish whatever escape is still in progress.
        if (todo == 1 && pending_char == '\\')
          {
            buf[count++] = (byte) '\\';
            room--;
            todo = 0;
            continue;
          }
        if (todo >= 1 && todo <= 4)
          {
            todo--;
            int nibble = ((int) pending_char >> (todo * 4)) & 0xF;
            buf[count++] = (byte) Character.forDigit(nibble, 16);
            room--;
            continue;
          }
        if (todo == 5)
          {
            buf[count++] = (byte) 'u';
            room--;
            todo = 4;
            continue;
          }

        // Nothing pending: take the next input char, if any.
        if (inlength == 0)
          break;
        char ch = inbuffer[inpos++];
        inlength--;

        if (ch != '\\' && ch < 127)
          {
            buf[count++] = (byte) ch;
            room--;
            continue;
          }

        // Both remaining cases start with a backslash and leave the
        // rest of the output to the state machine above.
        buf[count++] = (byte) '\\';
        pending_char = ch;
        todo = (ch == '\\') ? 1 : 5;
        room--;
      }
    return inpos - first;
  }
}

View file

@ -0,0 +1,108 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/** Encoder for UTF-8.  Partially written multi-byte sequences are
 * remembered in fields and completed by a later call to write. */
public class Output_UTF8 extends UnicodeToBytes
{
  public String getName() { return "UTF8"; }

  /** True if a surrogate pair should be emitted as a single UTF8 sequence.
   * Otherwise, a surrogate pair is treated as two separate characters.
   * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
  public boolean standardUTF8;

  // Saves the previous char if it was a high-surrogate.
  char hi_part;
  // Value of incomplete character.
  int value;
  // Number of continuation bytes still to emit.
  int bytes_todo;

  /** Convert chars to bytes, appending them to buf at count.
   * @param inbuffer source of characters to convert
   * @param inpos index of the first character to convert
   * @param inlength number of characters to convert
   * @return number of chars taken from inbuffer
   */
  public int write (char[] inbuffer, int inpos, int inlength)
  {
    int start_pos = inpos;
    int avail = buf.length - count;
    for (;;)
      {
        // Keep going while continuation bytes are pending even if the
        // input is exhausted; otherwise the tail of the last character
        // would stay unwritten although there is room for it.
        if (avail == 0 || (inlength == 0 && bytes_todo == 0))
          break;
        // The algorithm is made more complicated because we want to write
        // at least one byte in the output buffer, if there is room for
        // that byte, and at least one input character is available.
        // This makes the code more robust, since client code will
        // always "make progress", even in the complicated cases,
        // where the output buffer only has room for only *part* of a
        // multi-byte sequence, or the input char buffer only has half
        // of a surrogate pair (when standardUTF8 is set), or both.

        // Handle continuation characters we did not have room for before.
        if (bytes_todo > 0)
          {
            do
              {
                bytes_todo--;
                // A continuation byte is 10xxxxxx: six payload bits
                // (mask 0x3F) below the 0x80 marker.
                buf[count++] = (byte)
                  (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
                avail--;
              }
            while (bytes_todo > 0 && avail > 0);
            continue;
          }
        char ch = inbuffer[inpos++];
        inlength--;
        if (ch < 128 && (ch != 0 || standardUTF8))
          {
            avail--;
            buf[count++] = (byte) ch;
          }
        else if (ch <= 0x07FF)
          {
            buf[count++] = (byte) (0xC0 | (ch >> 6));
            if (--avail > 0)
              {
                buf[count++] = (byte) ((ch & 0x3F) | 0x80);
                avail--;
              }
            else
              {
                value = ch;
                bytes_todo = 1;
                break;
              }
          }
        else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
          {
            if (ch <= 0xDBFF) // High surrogates
              {
                // The first byte is (0xF0 | value>>18), where value is the
                // Unicode scalar value of the combined character - which
                // we may not know yet. But from substituting:
                // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
                // hi==ch, and noting that (lo-0xDC00) < 0x400, we get
                // value>>18 == (ch-0xD800+0x40)>>8 == (ch-0xD7C0)>>8:
                buf[count++] = (byte) (0xF0 | ((ch - 0xD7C0) >> 8));
                avail--;
                hi_part = ch;
              }
            else // Low surrogates
              {
                // The lead byte went out with the high surrogate; the
                // three continuation bytes are emitted by the pending
                // handler at the top of the loop.
                value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
                bytes_todo = 3;
              }
          }
        else
          {
            buf[count++] = (byte) (0xE0 | (ch >> 12));
            value = ch;
            avail--;
            bytes_todo = 2;
          }
      }
    return inpos - start_pos;
  }
}

View file

@ -0,0 +1,90 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
public abstract class UnicodeToBytes
{
  /** Destination buffer for the conversion.
   * The locations buf[count] ... buf[buf.length-1] are available. */
  public byte[] buf;

  /** Index in buf at which the next converted byte is stored. */
  public int count;

  /** Cached encoder class for the encoding named by "file.encoding". */
  static Class defaultEncodingClass;

  /** Look up and cache the encoder class for the default encoding.
   * @throws NoClassDefFoundError if no encoder class exists for it
   */
  static synchronized void getDefaultEncodingClass()
  {
    // Re-check under the lock: another thread may have finished the
    // lookup between the caller's test and our acquiring the monitor.
    if (defaultEncodingClass != null)
      return;

    String encoding = System.getProperty("file.encoding");
    String className = "gnu.gcj.convert.Output_"+encoding;
    try
      {
        defaultEncodingClass = Class.forName(className);
      }
    catch (ClassNotFoundException ex)
      {
        throw new NoClassDefFoundError("missing default encoding "
                                       + encoding + " (class "
                                       + className + " not found)");
      }
  }

  /** Name of the encoding handled by this encoder. */
  public abstract String getName();

  /** Create an encoder for the default encoding.
   * Any failure to find or instantiate it yields an Output_8859_1 instead.
   */
  public static UnicodeToBytes getDefaultEncoder()
  {
    UnicodeToBytes encoder;
    try
      {
        if (defaultEncodingClass == null)
          getDefaultEncodingClass();
        encoder = (UnicodeToBytes) defaultEncodingClass.newInstance();
      }
    catch (Throwable ex)
      {
        encoder = new Output_8859_1();
      }
    return encoder;
  }

  /** Get a char-stream->byte-stream converter given an encoding name.
   * @param encoding suffix of the gnu.gcj.convert.Output_* class to load
   * @throws java.io.UnsupportedEncodingException if the class cannot be
   *   loaded or instantiated
   */
  public static UnicodeToBytes getEncoder (String encoding)
    throws java.io.UnsupportedEncodingException
  {
    try
      {
        Class encodingClass
          = Class.forName("gnu.gcj.convert.Output_" + encoding);
        return (UnicodeToBytes) encodingClass.newInstance();
      }
    catch (Throwable ex)
      {
        throw new java.io.UnsupportedEncodingException(encoding + " ("
                                                       + ex + ')');
      }
  }

  /** Set the buffer that converted bytes are written to.
   * @param buffer destination of output bytes
   * @param count index in buffer of the first free location
   */
  public final void setOutput(byte[] buffer, int count)
  {
    this.count = count;
    this.buf = buffer;
  }

  /** Convert chars to bytes.
   * Converted bytes are written to buf, starting at count.
   * @param inbuffer source of characters to convert
   * @param inpos index of initial character in inbuffer to convert
   * @param inlength number of characters to convert
   * @return number of chars converted
   * Also, this.count is incremented by the number of bytes converted.
   */
  public abstract int write (char[] inbuffer, int inpos, int inlength);
}

View file

@ -0,0 +1,154 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One entry of a mapping table: a two-byte code and its Unicode value. */
struct chval
{
unsigned char b1; /* 1st byte */
unsigned char b2; /* 2nd byte */
unsigned short uc; /* unicode value */
};
/* Each MAP(B1, B2, C) line of the included headers expands to one
   struct chval initializer. */
#define MAP(B1, B2, C) { B1, B2, C },
/* Entries of JIS0208.h followed by a { 255, 255, 0 } end marker; main
   stops scanning a table when it reaches an entry whose b1 is 255. */
struct chval chtab_0208[] = {
#include "JIS0208.h"
{ 255, 255, 0}
};
/* Entries of JIS0212.h, terminated the same way. */
struct chval chtab_0212[] = {
#include "JIS0212.h"
{ 255, 255, 0}
};
#undef MAP
/* Entries of both headers in one array without an end marker; the
   JIS0212.h entries get 0x80 or'ed into their first byte.  main sorts
   this array by Unicode value in "toJIS" mode. */
struct chval sorted[] = {
#define MAP(B1, B2, C) { B1, B2, C },
#include "JIS0208.h"
#undef MAP
#define MAP(B1, B2, C) { 0x80|B1, B2, C },
#include "JIS0212.h"
#undef MAP
};
/* Table selected by the command-line argument in main. */
struct chval *chtab;
int
compare (void *p1, void *p2)
{
struct chval *c1 = (struct chval *) p1;
struct chval *c2 = (struct chval *) p2;
return (int) c1->uc - (int) c2->uc;
}
/* Generate conversion tables from the JIS mapping headers.
   The single argument selects the mode:
     JIS0208 or JIS0212 - write to stdout a C array <arg>_to_Unicode
       indexed by [1st byte - min][2nd byte - min], with statistics
       on stderr;
     toJIS - write all entries sorted by Unicode value.
   Any other or missing argument is an error (exit status -1). */
int
main(int argc, char** argv)
{
  FILE *out = stdout;
  unsigned min1 = 256, max1 = 0, min2 = 256, max2 = 0, count = 0;
  unsigned short low1_uc = 0xFFFF, high1_uc = 0;
  unsigned short low2_uc = 0xFFFF, high2_uc = 0;
  int i; int row, col;
  /* A missing argument must not be dereferenced; the empty string
     falls through to the "bad argument" case below. */
  const char *mode = argc > 1 ? argv[1] : "";

  if (strcmp (mode, "JIS0208") == 0)
    chtab = chtab_0208;
  else if (strcmp (mode, "JIS0212") == 0)
    chtab = chtab_0212;
  else if (strcmp (mode, "toJIS") == 0)
    {
      int i;
      int count = sizeof(sorted)/sizeof(struct chval);
      qsort (sorted, count, sizeof(struct chval),
             compare);
      for (i = 0; i < count; i++)
        {
          fprintf (out, " 0x%04x -> 0x%02x, 0x%02x\n",
                   sorted[i].uc, sorted[i].b1, sorted[i].b2);
        }
      exit(0);
    }
  else
    {
      fprintf (stderr, "bad argument!");
      exit (-1);
    }

  /* First pass: find the range of each byte and count the entries. */
  for (i = 0; chtab[i].b1 != 255; i++)
    {
      if (chtab[i].b1 < min1) min1 = chtab[i].b1;
      if (chtab[i].b2 < min2) min2 = chtab[i].b2;
      if (chtab[i].b1 > max1) max1 = chtab[i].b1;
      if (chtab[i].b2 > max2) max2 = chtab[i].b2;
      count++;
    }
  fprintf(stderr, "1st byte ranges from %d to %d.\n", min1, max1);
  fprintf(stderr, "2nd byte ranges from %d to %d.\n", min2, max2);
  fprintf(out,"/* This file is automatically generated from %s.TXT. */\n",
          mode);
  fprintf(out, "unsigned short %s_to_Unicode[%d][%d] = {\n",
          mode, max1 - min1 + 1, max2 - min2 + 1);

  /* Second pass: walk the full row/column grid, consuming table
     entries in order and writing 0 for codes without a mapping.
     This relies on the table being sorted by (b1, b2). */
  i = 0;
  for (row = min1; row <= max1; row++)
    {
      fprintf(out, "/* 1st byte: %d */ { ", row);
      if (row < chtab[i].b1)
        {
          fprintf(out, "0 }, /* unused row */\n");
        }
      else if (row > chtab[i].b1)
        {
          fprintf (stderr, "error - char table out of order!\n");
          exit (-1);
        }
      else
        {
          fprintf(out, "\n");
          for (col = min2; col <= max2; col++)
            {
              if (row == chtab[i].b1 && col == chtab[i].b2)
                {
                  unsigned uc = chtab[i].uc;
                  /* Track the Unicode range separately for values
                     below and from 0x2000 upwards. */
                  if (uc < 0x2000)
                    {
                      if (uc > high1_uc)
                        high1_uc = uc;
                      if (uc < low1_uc)
                        low1_uc = uc;
                    }
                  else
                    {
                      if (uc > high2_uc)
                        high2_uc = uc;
                      if (uc < low2_uc)
                        low2_uc = uc;
                    }
                  fprintf (out, " /* 2nd byte: %d */ 0x%04x",
                           chtab[i].b2, uc);
                  i++;
                }
              else if (row < chtab[i].b1
                       || (row == chtab[i].b1 && col < chtab[i].b2))
                {
                  fprintf (out, " 0");
                }
              else
                {
                  fprintf (stderr, "error - char table our of order!\n");
                  exit (-1);
                }
              if (col != max2)
                fprintf (out, ",\n");
            }
          fprintf(out, row == max1 ? "}\n" : "},\n");
        }
    }
  fprintf(out, "};\n");
  fprintf(stderr, "total number of characters is %d.\n", count);
  fprintf(stderr, "Range is 0x%04x-0x%04x and 0x%04x-0x%04x.\n",
          low1_uc, high1_uc, low2_uc, high2_uc);
  return 0;
}

View file

@ -0,0 +1,101 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
#include <config.h>
#include <cni.h>
#include <gnu/gcj/convert/Input_EUCJIS.h>
#define ERROR_CHAR 0xFFFD
extern unsigned short JIS0208_to_Unicode[84][94];
extern unsigned short JIS0212_to_Unicode[76][94];
// Convert EUC-JIS bytes from inbuffer to Unicode chars.
// outbuffer receives the chars starting at index outpos; outlength is
// the maximum number of chars to produce.  Returns the number of chars
// stored.  The decoding state (codeset, first_byte) is kept in the
// object, so a multi-byte code may be split across calls.  Codes that
// are out of range or have no table entry are delivered as ERROR_CHAR.
jint
gnu::gcj::convert::Input_EUCJIS::read(jcharArray outbuffer, jint outpos,
                                      jint outlength)
{
  jint start_outpos = outpos;
  for (;;)
    {
      // outlength is a count (see BytesToUnicode.read), so compare it
      // with the number of chars produced so far, not with the index.
      if (outpos - start_outpos >= outlength)
        break;
      if (inpos >= inlength)
        break;
      int b = ((unsigned char*) elements(inbuffer))[inpos++];
      if (codeset == 0) // ASCII or JIS-Roman
        {
          if (b < 128)
            {
#if 0
              // Technically, we should translate 0x5c to Yen symbol;
              // in practice, it is not clear.
              if (b == 0x5c)
                b = 0x00A5; // Yen sign.
#endif
              elements(outbuffer)[outpos++] = (char) b;
            }
          else
            {
              if (b == 0x8E) // SS2
                codeset = 2;
              else if (b == 0x8F) // SS3
                codeset = 3;
              else
                {
                  codeset = 1;
                  first_byte = b;
                }
            }
        }
      else if (codeset == 1) // JIS X 0208:1997
        {
          first_byte -= 0x80 + 33;
          b -= 0x80 + 33;
          if ((unsigned) first_byte >= 84 || (unsigned) b >= 94)
            b = ERROR_CHAR;
          else
            {
              b = JIS0208_to_Unicode[first_byte][b];
              if (b == 0)
                b = ERROR_CHAR;
            }
          elements(outbuffer)[outpos++] = b;
          codeset = 0;
        }
      else if (codeset == 2) // Half-width katakana
        {
          if (b >= 0xA1 && b <= 0xDF)
            b += 0xFF61 - 0xA1;
          else
            b = ERROR_CHAR;
          elements(outbuffer)[outpos++] = b;
          codeset = 0;
        }
      else if (codeset == 3) // second byte of JIS X 0212-1990
        {
          first_byte = b;
          codeset = 4;
        }
      else // codeset == 4 // third byte of JIS X 0212-1990
        {
          first_byte -= 0x80 + 34;
          b -= 0x80 + 33;
          if ((unsigned) first_byte >= 76 || (unsigned) b >= 94)
            b = ERROR_CHAR;
          else
            {
              // This branch decodes JIS X 0212, so it must consult the
              // JIS0212 table (whose 76x94 shape the bounds check above
              // matches), not the JIS0208 one.
              b = JIS0212_to_Unicode[first_byte][b];
              if (b == 0)
                b = ERROR_CHAR;
            }
          elements(outbuffer)[outpos++] = b;
          codeset = 0;
        }
    }
  return outpos - start_outpos;
}

View file

@ -0,0 +1,285 @@
// Connection.java - Implementation of HttpURLConnection for http protocol.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.protocol.http;
import java.net.*;
import java.io.*;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
/**
* @author Warren Levy <warrenl@cygnus.com>
* @date March 29, 1999.
*/
/**
* Written using on-line Java Platform 1.2 API Specification, as well
* as "The Java Class Libraries", 2nd edition (Addison-Wesley, 1998).
* Status: Minimal subset of functionality. Proxies and Redirects
* not yet handled. FileNameMap handling needs to be considered.
* useCaches, ifModifiedSince, and allowUserInteraction need
* consideration as well as doInput and doOutput.
*/
class Connection extends HttpURLConnection
{
  /** Socket to the server; null while not connected. */
  protected Socket sock = null;
  /** Request properties copied into every new Connection. */
  private static Hashtable defRequestProperties = new Hashtable();
  /** Request properties sent by this connection. */
  private Hashtable requestProperties;
  /** Response header values, keyed by lower-cased header name. */
  private Hashtable hdrHash = new Hashtable();
  /** Raw response header lines in the order received. */
  private Vector hdrVec = new Vector();
  /** True once the response headers have been read. */
  private boolean gotHeaders = false;
  /** Buffered view of the socket's input, shared by header parsing and
   * getInputStream so that no bytes are lost between the two. */
  private BufferedInputStream bufferedIn;

  public Connection(URL url)
  {
    super(url);
    requestProperties = (Hashtable) defRequestProperties.clone();
  }

  // Override method in URLConnection.
  public static void setDefaultRequestProperty(String key, String value)
  {
    defRequestProperties.put(key, value);
  }

  // Override method in URLConnection.
  public static String getDefaultRequestProperty(String key)
  {
    return (String) defRequestProperties.get(key);
  }

  // Override method in URLConnection.
  public void setRequestProperty(String key, String value)
  {
    if (connected)
      throw new IllegalAccessError("Connection already established.");
    requestProperties.put(key, value);
  }

  // Override method in URLConnection.
  public String getRequestProperty(String key)
  {
    if (connected)
      throw new IllegalAccessError("Connection already established.");
    return (String) requestProperties.get(key);
  }

  /** Open the socket and send the request line and request headers.
   * Does nothing if already connected.
   */
  public void connect() throws IOException
  {
    // Call is ignored if already connected.
    if (connected)
      return;

    // Get address and port number.
    int port;
    InetAddress destAddr = InetAddress.getByName(url.getHost());
    if ((port = url.getPort()) == -1)
      port = 80;

    // Open socket and output stream.
    sock = new Socket(destAddr, port);
    PrintWriter out = new PrintWriter(sock.getOutputStream());

    // A URL without a path has an empty file part, but the request
    // line needs a non-empty target.
    String file = url.getFile();
    if (file == null || file.length() == 0)
      file = "/";

    // Send request including any request properties that were set.
    // HTTP requires every line to end in CR LF.
    out.print(getRequestMethod() + " " + file + " HTTP/1.1\r\n");
    out.print("Host: " + url.getHost() + ":" + port + "\r\n");
    Enumeration reqKeys = requestProperties.keys();
    while (reqKeys.hasMoreElements())
      {
        Object key = reqKeys.nextElement();
        out.print(key + ": " + requestProperties.get(key) + "\r\n");
      }
    out.print("\r\n");
    out.flush();
    connected = true;
  }

  /** Close the socket, ignoring errors, and mark as disconnected. */
  public void disconnect()
  {
    if (sock != null)
      {
        try
          {
            sock.close();
          }
        catch (IOException ex)
          {
            ; // Ignore errors in closing socket.
          }
        sock = null;
      }
    // The buffered stream wraps the socket just closed; drop it so a
    // later connect does not hand out a dead stream.
    bufferedIn = null;
    connected = false;
  }

  // TODO: public boolean usingProxy()
  public boolean usingProxy()
  {
    throw new InternalError("HttpURLConnection.usingProxy not implemented");
  }

  // Override default method in URLConnection.
  public InputStream getInputStream() throws IOException
  {
    if (!connected)
      connect();
    if (bufferedIn == null)
      bufferedIn = new BufferedInputStream(sock.getInputStream());
    return bufferedIn;
  }

  // Override default method in URLConnection.
  public OutputStream getOutputStream() throws IOException
  {
    if (!connected)
      connect();
    return sock.getOutputStream();
  }

  /** Value of the named response header (case-insensitive), or null if
   * it is absent or the headers could not be read. */
  public String getHeaderField(String name)
  {
    try
      {
        getHttpHeaders();
      }
    catch (IOException x)
      {
        return null;
      }
    return (String) hdrHash.get(name.toLowerCase());
  }

  /** Value part of the n-th response header line (the whole line if it
   * has no colon), or null if n is out of range or reading failed. */
  public String getHeaderField(int n)
  {
    try
      {
        getHttpHeaders();
      }
    catch (IOException x)
      {
        return null;
      }
    if (n >= 0 && n < hdrVec.size())
      return getField((String) hdrVec.elementAt(n));
    return null;
  }

  /** Name part of the n-th response header line, or null if the line
   * has no colon, n is out of range, or reading failed. */
  public String getHeaderFieldKey(int n)
  {
    try
      {
        getHttpHeaders();
      }
    catch (IOException x)
      {
        return null;
      }
    if (n >= 0 && n < hdrVec.size())
      return getKey((String) hdrVec.elementAt(n));
    return null;
  }

  /** Text before the first ':' of a header line, or null if none. */
  private String getKey(String str)
  {
    if (str == null)
      return null;
    int index = str.indexOf(':');
    if (index >= 0)
      return str.substring(0, index);
    else
      return null;
  }

  /** Trimmed text after the first ':' of a header line; the line
   * itself if it contains no ':'. */
  private String getField(String str)
  {
    if (str == null)
      return null;
    int index = str.indexOf(':');
    if (index >= 0)
      return str.substring(index + 1).trim();
    else
      return str;
  }

  /** Read the response headers once, filling hdrVec and hdrHash.
   * Reading stops at the empty line ending the headers, or at end of
   * stream.
   */
  private void getHttpHeaders() throws IOException
  {
    if (gotHeaders)
      return;
    connect();
    // Only now mark the headers as fetched, so that a failed connect
    // can be retried by a later call.
    gotHeaders = true;

    // A BufferedReader with readLine would be simpler, but it reads
    // ahead past the end of the headers and so would swallow the start
    // of the content.  Read single bytes from the shared stream instead.
    if (bufferedIn == null)
      bufferedIn = new BufferedInputStream(sock.getInputStream());

    for (;;)
      {
        String line = readHeaderLine();
        // A zero length entry signals the end of the headers; null
        // means the stream ended first.
        if (line == null || line.length() == 0)
          break;
        hdrVec.addElement(line);
        String key = getKey(line);
        if (key != null)
          hdrHash.put(key.toLowerCase(), getField(line));
      }
  }

  /** Read one CR LF terminated line from bufferedIn.
   * A '\r' not followed by '\n' is kept as part of the line.
   * @return the line without its terminator; the partial line if the
   *   stream ends inside it; null if the stream ends before any byte
   */
  private String readHeaderLine() throws IOException
  {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    int b = bufferedIn.read();
    while (b >= 0)
      {
        if (b == '\r')
          {
            int next = bufferedIn.read();
            if (next == '\n')
              return bytes.toString();
            bytes.write('\r');
            // Re-examine the byte after the lone '\r'; it may itself
            // be a '\r' or the end of the stream.
            b = next;
            continue;
          }
        bytes.write(b);
        b = bufferedIn.read();
      }
    return bytes.size() == 0 ? null : bytes.toString();
  }
}

View file

@ -0,0 +1,35 @@
// Handler.java - URLStreamHandler for http protocol.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.protocol.http;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLStreamHandler;
import java.io.IOException;
/**
* @author Warren Levy <warrenl@cygnus.com>
* @date March 26, 1999.
*/
/**
* Written using on-line Java Platform 1.2 API Specification, as well
* as "The Java Class Libraries", 2nd edition (Addison-Wesley, 1998).
* Status: Minimal functionality.
*/
public class Handler extends URLStreamHandler
{
  /** Create the http Connection object for URL.
   * @param url the URL to connect to
   * @return a new Connection for that URL
   */
  protected URLConnection openConnection(URL url) throws IOException
  {
    URLConnection connection = new Connection(url);
    return connection;
  }
}

View file

@ -0,0 +1,82 @@
// Base class for default BreakIterators.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
* @author Tom Tromey <tromey@cygnus.com>
* @date March 22, 1999
*/
/** Common part of the default BreakIterators.  The current position is
 * the index of the wrapped CharacterIterator; subclasses supply next()
 * and previous(). */
public abstract class BaseBreakIterator extends BreakIterator
{
  /** Return the current position, i.e. the index of the text iterator. */
  public int current ()
  {
    return iter.getIndex();
  }

  /** Move to the start of the text and return that index. */
  public int first ()
  {
    iter.first();
    return iter.getBeginIndex();
  }

  /** Return the first boundary after POS and make it the current
   * position, as BreakIterator specifies (current() reports the
   * boundary most recently returned by following, among others).
   * @param pos index to start searching from
   * @return the boundary found by next(), or DONE
   */
  public int following (int pos)
  {
    iter.setIndex(pos);
    return next ();
  }

  /** Return the text iterator being scanned. */
  public CharacterIterator getText ()
  {
    return iter;
  }

  /** Move to the end of the text and return that index.
   * CharacterIterator.last() stops on the last character, one short of
   * the end boundary, so the index is set explicitly.
   */
  public int last ()
  {
    int end = iter.getEndIndex();
    iter.setIndex(end);
    return end;
  }

  /** Advance N boundaries forward (N positive) or backward (N negative).
   * @return the boundary reached, the current position if N is zero,
   *   or DONE if the text is exhausted first
   */
  public int next (int n)
  {
    int r = iter.getIndex ();
    if (n > 0)
      {
        while (n > 0 && r != DONE)
          {
            r = next ();
            --n;
          }
      }
    else if (n < 0)
      {
        while (n < 0 && r != DONE)
          {
            r = previous ();
            ++n;
          }
      }
    return r;
  }

  /** Set the text to scan; the iterator is used directly, not copied. */
  public void setText (CharacterIterator newText)
  {
    iter = newText;
  }

  /** The text being scanned; its index is the current position. */
  protected CharacterIterator iter;
}

View file

@ -0,0 +1,188 @@
// Default character BreakIterator.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
* @author Tom Tromey <tromey@cygnus.com>
* @date March 19, 1999
* Written using The Unicode Standard, Version 2.0.
*/
public class CharacterBreakIterator extends BaseBreakIterator
{
// Hangul Jamo constants from Unicode book.
private static final int LBase = 0x1100;
private static final int VBase = 0x1161;
private static final int TBase = 0x11a7;
private static final int LCount = 19;
private static final int VCount = 21;
private static final int TCount = 28;
// Information about surrogates.
private static final int highSurrogateStart = 0xD800;
private static final int highSurrogateEnd = 0xDBFF;
private static final int lowSurrogateStart = 0xDC00;
private static final int lowSurrogateEnd = 0xDFFF;
/** Return a new CharacterBreakIterator built from this one by the
 * private copy constructor. */
public Object clone ()
{
  CharacterBreakIterator copy = new CharacterBreakIterator (this);
  return copy;
}
/** Create an iterator with no text attached; iter stays null until it
 * is assigned elsewhere. */
public CharacterBreakIterator ()
{
iter = null; // FIXME?
}
/** Copy constructor used by clone: the new iterator gets its own clone
 * of OTHER's text iterator. */
private CharacterBreakIterator (CharacterBreakIterator other)
{
  Object textCopy = other.iter.clone();
  iter = (CharacterIterator) textCopy;
}
// Some methods to tell us different properties of characters.
private final boolean isL (char c)
{
return c >= LBase && c <= LBase + LCount;
}
private final boolean isV (char c)
{
return c >= VBase && c <= VBase + VCount;
}
private final boolean isT (char c)
{
return c >= TBase && c <= TBase + TCount;
}
private final boolean isLVT (char c)
{
return isL (c) || isV (c) || isT (c);
}
private final boolean isHighSurrogate (char c)
{
return c >= highSurrogateStart && c <= highSurrogateEnd;
}
private final boolean isLowSurrogate (char c)
{
return c >= lowSurrogateStart && c <= lowSurrogateEnd;
}
public int next ()
{
int end = iter.getEndIndex();
if (iter.getIndex() == end)
return DONE;
char c;
for (char prev = CharacterIterator.DONE; iter.getIndex() < end; prev = c)
{
c = iter.next();
if (c == CharacterIterator.DONE)
break;
int type = Character.getType(c);
// Break after paragraph separators.
if (type == Character.PARAGRAPH_SEPARATOR)
break;
// Now we need some lookahead.
char ahead = iter.next();
iter.previous();
if (ahead == CharacterIterator.DONE)
break;
int aheadType = Character.getType(ahead);
if (aheadType != Character.NON_SPACING_MARK
&& ! isLowSurrogate (ahead)
&& ! isLVT (ahead))
break;
if (! isLVT (c) && isLVT (ahead))
break;
if (isL (c) && ! isLVT (ahead)
&& aheadType != Character.NON_SPACING_MARK)
break;
if (isV (c) && ! isV (ahead) && !isT (ahead)
&& aheadType != Character.NON_SPACING_MARK)
break;
if (isT (c) && ! isT (ahead)
&& aheadType != Character.NON_SPACING_MARK)
break;
if (! isHighSurrogate (c) && isLowSurrogate (ahead))
break;
if (isHighSurrogate (c) && ! isLowSurrogate (ahead))
break;
if (! isHighSurrogate (prev) && isLowSurrogate (c))
break;
}
return iter.getIndex();
}
public int previous ()
{
if (iter.getIndex() == iter.getBeginIndex())
return DONE;
int start = iter.getBeginIndex();
while (iter.getIndex() >= iter.getBeginIndex())
{
char c = iter.previous();
if (c == CharacterIterator.DONE)
break;
int type = Character.getType(c);
if (type != Character.NON_SPACING_MARK
&& ! isLowSurrogate (c)
&& ! isLVT (c))
break;
// Now we need some lookahead.
char ahead = iter.previous();
if (ahead == CharacterIterator.DONE)
{
iter.next();
break;
}
char ahead2 = iter.previous();
iter.next();
iter.next();
if (ahead2 == CharacterIterator.DONE)
break;
int aheadType = Character.getType(ahead);
if (aheadType == Character.PARAGRAPH_SEPARATOR)
break;
if (isLVT (c) && ! isLVT (ahead))
break;
if (! isLVT (c) && type != Character.NON_SPACING_MARK
&& isL (ahead))
break;
if (! isV (c) && ! isT (c) && type != Character.NON_SPACING_MARK
&& isV (ahead))
break;
if (! isT (c) && type != Character.NON_SPACING_MARK
&& isT (ahead))
break;
if (isLowSurrogate (c) && ! isHighSurrogate (ahead))
break;
if (! isLowSurrogate (c) && isHighSurrogate (ahead))
break;
if (isLowSurrogate (ahead) && ! isHighSurrogate (ahead2))
break;
}
return iter.getIndex();
}
}

View file

@ -0,0 +1,168 @@
// Default line BreakIterator.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
 * Default line BreakIterator.  Boundary decisions are based on paragraph
 * and line separators, spaces, no-break characters, non-spacing marks,
 * opening and closing punctuation, and the ideographic ranges tested by
 * <code>isIdeo</code>.  The text being scanned is held in the inherited
 * <code>iter</code> field, whose index is the current position.
 *
 * @author Tom Tromey <tromey@cygnus.com>
 * @date March 22, 1999
 * Written using The Unicode Standard, Version 2.0.
 */
public class LineBreakIterator extends BaseBreakIterator
{
// Returns a new LineBreakIterator scanning a clone of this iterator's text.
public Object clone ()
{
return new LineBreakIterator (this);
}
// Creates an iterator with no text attached: iter is null until it is
// assigned (presumably through setText -- TODO confirm).
public LineBreakIterator ()
{
iter = null;
}
// Copy constructor used by clone().  Dereferences other.iter, so it
// fails with a NullPointerException if no text has been attached.
private LineBreakIterator (LineBreakIterator other)
{
iter = (CharacterIterator) other.iter.clone();
}
// Some methods to tell us different properties of characters.
// True for the three no-break characters listed here.
private final boolean isNb (char c)
{
return (c == 0x00a0 // NO-BREAK SPACE
|| c == 0x2011 // NON-BREAKING HYPHEN
|| c == 0xfeff); // ZERO WIDTH NO-BREAK SPACE
}
// Takes a Character.getType() value, not a character.
private final boolean isClose (int type)
{
return (type == Character.END_PUNCTUATION
// Unicode book says "comma, period, ...", which I take to
// mean "Po" class.
|| type == Character.OTHER_PUNCTUATION);
}
private final boolean isIdeo (char c)
{
return (c >= 0x3040 && c <= 0x309f // Hiragana
|| c >= 0x30a0 && c <= 0x30ff // Katakana
|| c >= 0x4e00 && c <= 0x9fff // Han
|| c >= 0x3100 && c <= 0x312f); // Bopomofo
}
// Advances iter and returns its resulting index.  Returns DONE, without
// moving, when iter is already at its end index.
public int next ()
{
int end = iter.getEndIndex();
if (iter.getIndex() == end)
return DONE;
while (iter.getIndex() < end)
{
// c is the character we are standing on, n the one after it; the index
// now points at n.
char c = iter.current();
int type = Character.getType(c);
char n = iter.next();
if (n == CharacterIterator.DONE
|| type == Character.PARAGRAPH_SEPARATOR
|| type == Character.LINE_SEPARATOR)
break;
// Handle two cases where we must scan for non-spacing marks.
// start is where the next pass resumes if no break is found.
int start = iter.getIndex();
if (type == Character.SPACE_SEPARATOR
|| type == Character.START_PUNCTUATION
|| isIdeo (c))
{
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.NON_SPACING_MARK)
n = iter.next();
if (n == CharacterIterator.DONE)
break;
if (type == Character.SPACE_SEPARATOR)
{
// The nt != NON_SPACING_MARK test cannot fail here, because the loop
// above has already skipped every non-spacing mark.
int nt = Character.getType(n);
if (nt != Character.NON_SPACING_MARK
&& nt != Character.SPACE_SEPARATOR
&& ! isNb (n))
break;
}
else if (type == Character.START_PUNCTUATION)
{
if (isIdeo (n))
{
// Open punctuation followed by non spacing marks
// and then ideograph does not have a break in
// it. So skip all this.
start = iter.getIndex();
}
}
else
{
// Ideograph preceded this character.
// NOTE(review): this breaks when n IS closing punctuation, whereas
// previous() breaks after an ideograph when the following character is
// NOT closing punctuation -- confirm which is intended.
if (isClose (Character.getType(n)))
break;
}
}
iter.setIndex(start);
}
return iter.getIndex();
}
// Moves iter backwards and returns its resulting index.  Returns DONE,
// without moving, when iter is already at its begin index.
public int previous ()
{
int start = iter.getBeginIndex();
if (iter.getIndex() == start)
return DONE;
while (iter.getIndex() >= start)
{
// c is the character just before the old position and n the character
// before c; after the iter.next() below the index is again that of c.
char c = iter.previous();
if (c == CharacterIterator.DONE)
break;
int type = Character.getType(c);
char n = iter.previous();
if (n == CharacterIterator.DONE)
break;
iter.next();
int nt = Character.getType(n);
// Break after paragraph separators.
if (nt == Character.PARAGRAPH_SEPARATOR
|| nt == Character.LINE_SEPARATOR)
break;
// Skip non-spacing marks.
int init = iter.getIndex();
while (n != CharacterIterator.DONE && nt == Character.NON_SPACING_MARK)
{
n = iter.previous();
nt = Character.getType(n);
}
// NOTE(review): the three breaks below fire before iter.setIndex(init),
// so when marks were skipped above the returned index is where the skip
// stopped rather than init -- confirm that this is intended.
if (nt == Character.SPACE_SEPARATOR
&& type != Character.SPACE_SEPARATOR
&& type != Character.NON_SPACING_MARK
&& ! isNb (c))
break;
if (! isClose (type) && isIdeo (n))
break;
if (isIdeo (c) && nt != Character.START_PUNCTUATION)
break;
iter.setIndex(init);
}
return iter.getIndex();
}
}

View file

@ -0,0 +1,75 @@
// Generic English locale data for java.text.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.util.ListResourceBundle;
/**
 * Generic English resources for the number and date formatting classes.
 * The keys and values are looked up through the usual
 * ListResourceBundle machinery.
 *
 * @author Tom Tromey <tromey@cygnus.com>
 * @date March 4, 1999
 */
public final class LocaleData_en extends ListResourceBundle
{
  // Data for DateFormatSymbols.  The month tables carry a trailing empty
  // entry and the weekday tables a leading one.
  static final String[] ampmsDefault = new String[] { "AM", "PM" };

  static final String[] erasDefault = new String[] { "BC", "AD" };

  static final String localPatternCharsDefault = "GyMdkHmsSEDFwWahKz";

  static final String[] monthsDefault = new String[]
  {
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
    ""
  };

  static final String[] shortMonthsDefault = new String[]
  {
    "Jan", "Feb", "Mar",
    "Apr", "May", "Jun",
    "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec",
    ""
  };

  static final String[] shortWeekdaysDefault = new String[]
  {
    "",
    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
  };

  static final String[] weekdaysDefault = new String[]
  {
    "",
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  };

  // The key/value table handed to ListResourceBundle.
  private static final Object[][] contents = new Object[][]
  {
    // DecimalFormatSymbols.
    { "decimalSeparator", "." },
    { "digit", "#" },
    { "exponential", "E" },
    { "groupingSeparator", "," },
    { "infinity", "\u221e" },
    { "minusSign", "-" },
    { "NaN", "\ufffd" },
    { "patternSeparator", ";" },
    { "percent", "%" },
    { "perMill", "\u2030" },
    { "zeroDigit", "0" },

    // NumberFormat.
    { "numberFormat", "#,##0.###" },
    { "percentFormat", "#,##0%" },

    // DateFormatSymbols.
    { "ampm", ampmsDefault },
    { "eras", erasDefault },
    { "datePatternChars", localPatternCharsDefault },
    { "months", monthsDefault },
    { "shortMonths", shortMonthsDefault },
    { "shortWeekdays", shortWeekdaysDefault },
    { "weekdays", weekdaysDefault }
  };

  /**
   * Supply the resource table to ListResourceBundle.
   *
   * @return the shared table itself, not a copy
   */
  protected Object[][] getContents ()
  {
    return contents;
  }
}

View file

@ -0,0 +1,71 @@
// US English locale data for java.text.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.util.ListResourceBundle;
/**
 * US English resources for the number and date formatting classes:
 * currency symbols and pattern, time zone names, and the date and time
 * patterns for each style.
 *
 * @author Tom Tromey <tromey@cygnus.com>
 * @date March 4, 1999
 */
public final class LocaleData_en_US extends ListResourceBundle
{
  // These are for DateFormatSymbols.
  // Each row holds six strings: a zone ID, a long and a short name, a
  // second long and short name (daylight time, where the names differ),
  // and a city.
  static String[][] zoneStringsDefault = {
    { "PST", "Pacific Standard Time", "PST",
      /**/ "Pacific Daylight Time", "PDT", "San Francisco" },
    { "MST", "Mountain Standard Time", "MST",
      /**/ "Mountain Daylight Time", "MDT", "Denver" },
    { "PNT", "Mountain Standard Time", "MST",
      /**/ "Mountain Standard Time", "MST", "Phoenix" },
    { "CST", "Central Standard Time", "CST",
      /**/ "Central Daylight Time", "CDT", "Chicago" },
    { "EST", "Eastern Standard Time", "EST",
      /**/ "Eastern Daylight Time", "EDT", "Boston" },
    { "IET", "Eastern Standard Time", "EST",
      /**/ "Eastern Standard Time", "EST", "Indianapolis" },
    { "PRT", "Atlantic Standard Time", "AST",
      /**/ "Atlantic Daylight Time", "ADT", "Halifax" },
    { "HST", "Hawaii Standard Time", "HST",
      /**/ "Hawaii Daylight Time", "HDT", "Honolulu" },
    { "AST", "Alaska Standard Time", "AST",
      /**/ "Alaska Daylight Time", "ADT", "Anchorage" }
  };

  // The key/value table handed to ListResourceBundle.
  private static final Object[][] contents =
  {
    // These are for DecimalFormatSymbols.
    { "currency", "$" },
    // The international currency symbol is the ISO 4217 currency code,
    // not the local currency sign.
    { "intlCurrencySymbol", "USD" },
    // These are for NumberFormat.
    { "currencyFormat", "$#,##0.00;($#,##0.00)" },
    // These are for DateFormatSymbols.
    { "zoneStrings", zoneStringsDefault },
    // These are for DateFormat.
    { "shortDateFormat", "M/d/yy" }, // Java's Y2K bug.
    { "mediumDateFormat", "d-MMM-yy" },
    { "longDateFormat", "MMMM d, yyyy" },
    { "fullDateFormat", "EEEE MMMM d, yyyy G" },
    { "shortTimeFormat", "h:mm a" },
    { "mediumTimeFormat", "h:mm:ss a" },
    { "longTimeFormat", "h:mm:ss a z" },
    // NOTE(review): the ";S" puts a semicolon and the milliseconds field
    // into the full time style -- confirm that this is intended.
    { "fullTimeFormat", "h:mm:ss;S 'o''clock' a z" }
  };

  /**
   * Supply the resource table to ListResourceBundle.
   *
   * @return the shared table itself, not a copy
   */
  protected Object[][] getContents ()
  {
    return contents;
  }
}

View file

@ -0,0 +1,226 @@
// Default sentence BreakIterator.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
 * Default sentence BreakIterator.  Boundary decisions are based on
 * paragraph separators and on the terminators <code>'.'</code>,
 * <code>'!'</code> and <code>'?'</code>, together with the closing
 * punctuation, spaces, opening punctuation and letter case around them.
 * The text being scanned is held in the inherited <code>iter</code>
 * field, whose index is the current position.
 *
 * @author Tom Tromey <tromey@cygnus.com>
 * @date March 23, 1999
 * Written using The Unicode Standard, Version 2.0.
 */
public class SentenceBreakIterator extends BaseBreakIterator
{
// Returns a new SentenceBreakIterator scanning a clone of this
// iterator's text.
public Object clone ()
{
return new SentenceBreakIterator (this);
}
// Creates an iterator with no text attached: iter is null until it is
// assigned (presumably through setText -- TODO confirm).
public SentenceBreakIterator ()
{
iter = null;
}
// Copy constructor used by clone().  Only the text is copied; the
// scratch field period is not.
private SentenceBreakIterator (SentenceBreakIterator other)
{
iter = (CharacterIterator) other.iter.clone();
}
// Advances iter and returns its resulting index.  Returns DONE, without
// moving, when iter is already at its end index.
public int next ()
{
int end = iter.getEndIndex();
if (iter.getIndex() == end)
return DONE;
while (iter.getIndex() < end)
{
// c is the character we are standing on, n the one after it; the index
// now points at n.
char c = iter.current();
if (c == CharacterIterator.DONE)
break;
int type = Character.getType(c);
char n = iter.next();
if (n == CharacterIterator.DONE)
break;
// Always break after paragraph separator.
if (type == Character.PARAGRAPH_SEPARATOR)
break;
if (c == '!' || c == '?')
{
// Skip close punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.END_PUNCTUATION)
n = iter.next();
// Skip spaces.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.SPACE_SEPARATOR)
n = iter.next();
// Skip optional paragraph separator.
if (n != CharacterIterator.DONE
&& Character.getType(n) == Character.PARAGRAPH_SEPARATOR)
n = iter.next();
// There's always a break somewhere after `!' or `?'.
break;
}
if (c == '.')
{
// save is where scanning resumes if this period turns out not to end
// a sentence.
int save = iter.getIndex();
// Skip close punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.END_PUNCTUATION)
n = iter.next();
// Skip spaces. We keep count because we need at least
// one for this period to represent a terminator.
int spcount = 0;
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.SPACE_SEPARATOR)
{
n = iter.next();
++spcount;
}
if (spcount > 0)
{
// save2 is the position just after the spaces; it becomes the break
// if the lookahead below succeeds.
int save2 = iter.getIndex();
// Skip over open punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.START_PUNCTUATION)
n = iter.next();
// Next character must not be lower case.
if (n == CharacterIterator.DONE
|| ! Character.isLowerCase(n))
{
iter.setIndex(save2);
break;
}
}
iter.setIndex(save);
}
}
return iter.getIndex();
}
// Moves iter backwards to a sentence start and returns its resulting
// index, or DONE (without moving) when iter is already at its begin
// index.  On the paths marked "Communicate location of actual end" it
// also stores an index into the field period for use by previous().
private final int previous_internal ()
{
int start = iter.getBeginIndex();
if (iter.getIndex() == start)
return DONE;
while (iter.getIndex() >= start)
{
// c is the character just before the old position and n the character
// before c; after the iter.next() below the index is again that of c.
char c = iter.previous();
if (c == CharacterIterator.DONE)
break;
char n = iter.previous();
if (n == CharacterIterator.DONE)
break;
iter.next();
int nt = Character.getType(n);
// First possibility: c starts a sentence that follows a period.
if (! Character.isLowerCase(c)
&& (nt == Character.START_PUNCTUATION
|| nt == Character.SPACE_SEPARATOR))
{
// Remember the state so that the later tests see the original n and nt
// if this lookbehind fails.
int save = iter.getIndex();
int save_nt = nt;
char save_n = n;
// Skip open punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.START_PUNCTUATION)
n = iter.previous();
if (n == CharacterIterator.DONE)
break;
if (Character.getType(n) == Character.SPACE_SEPARATOR)
{
// Must have at least one space after the `.'.
int save2 = iter.getIndex();
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.SPACE_SEPARATOR)
n = iter.previous();
// Skip close punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.END_PUNCTUATION)
n = iter.previous();
if (n == CharacterIterator.DONE || n == '.')
{
// Communicate location of actual end.
period = iter.getIndex();
iter.setIndex(save2);
break;
}
}
iter.setIndex(save);
nt = save_nt;
n = save_n;
}
if (nt == Character.PARAGRAPH_SEPARATOR)
{
// Communicate location of actual end.
period = iter.getIndex();
break;
}
else if (nt == Character.SPACE_SEPARATOR
|| nt == Character.END_PUNCTUATION)
{
// Look back across spaces and close punctuation for a `!' or `?',
// then restore the index whatever the outcome.
int save = iter.getIndex();
// Skip spaces.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.SPACE_SEPARATOR)
n = iter.previous();
// Skip close punctuation.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.END_PUNCTUATION)
n = iter.previous();
int here = iter.getIndex();
iter.setIndex(save);
if (n == CharacterIterator.DONE || n == '!' || n == '?')
{
// Communicate location of actual end.
period = here;
break;
}
}
else if (n == '!' || n == '?')
{
// Communicate location of actual end.
period = iter.getIndex();
break;
}
}
return iter.getIndex();
}
// Runs previous_internal() once, and a second time from the index it left
// in period unless the scan started at the end of the text or the first
// run returned DONE.
public int previous ()
{
// We want to skip over the first sentence end to the second one.
// However, at the end of the string we want the first end.
int here = iter.getIndex();
// Default for the case where previous_internal() does not set period.
period = here;
int first = previous_internal ();
if (here == iter.getEndIndex() || first == DONE)
return first;
iter.setIndex(period);
return previous_internal ();
}
// This is used for communication between previous and
// previous_internal.
private int period;
}

View file

@ -0,0 +1,224 @@
// Default word BreakIterator.
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
 * Default word BreakIterator.  Boundary decisions are based on paragraph
 * and line separators, letters versus non-letters (with the apostrophe
 * treated as part of a word), punctuation and symbol classes,
 * non-spacing marks, and the Hiragana, Katakana and Han ranges tested
 * below.  The text being scanned is held in the inherited
 * <code>iter</code> field, whose index is the current position.
 *
 * @author Tom Tromey <tromey@cygnus.com>
 * @date March 22, 1999
 * Written using The Unicode Standard, Version 2.0.
 */
public class WordBreakIterator extends BaseBreakIterator
{
// Returns a new WordBreakIterator scanning a clone of this iterator's text.
public Object clone ()
{
return new WordBreakIterator (this);
}
// Creates an iterator with no text attached: iter is null until it is
// assigned (presumably through setText -- TODO confirm).
public WordBreakIterator ()
{
iter = null;
}
// Copy constructor used by clone().  Dereferences other.iter, so it
// fails with a NullPointerException if no text has been attached.
private WordBreakIterator (WordBreakIterator other)
{
iter = (CharacterIterator) other.iter.clone();
}
// Some methods to tell us different properties of characters.
// Each is a plain code point range test.
private final boolean isHira (char c)
{
return c >= 0x3040 && c <= 0x309f;
}
private final boolean isKata (char c)
{
return c >= 0x30a0 && c <= 0x30ff;
}
private final boolean isHan (char c)
{
return c >= 0x4e00 && c <= 0x9fff;
}
// Advances iter and returns its resulting index.  Returns DONE, without
// moving, when iter is already at its end index.
public int next ()
{
int end = iter.getEndIndex();
if (iter.getIndex() == end)
return DONE;
while (iter.getIndex() < end)
{
// c is the character we are standing on, n the one after it; the index
// now points at n.
char c = iter.current();
if (c == CharacterIterator.DONE)
break;
int type = Character.getType(c);
char n = iter.next();
if (n == CharacterIterator.DONE)
break;
// Break after paragraph separators.
if (type == Character.PARAGRAPH_SEPARATOR
|| type == Character.LINE_SEPARATOR)
break;
// Break between letters and non-letters.
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
boolean is_letter = Character.isLetter(c);
if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
&& Character.isLetter(n))
break;
// Always break after certain symbols, such as punctuation.
// This heuristic is derived from hints in the JCL book and is
// not part of Unicode. It seems to be right, however.
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
if (c != '\''
&& (type == Character.DASH_PUNCTUATION
|| type == Character.START_PUNCTUATION
|| type == Character.END_PUNCTUATION
|| type == Character.CONNECTOR_PUNCTUATION
|| type == Character.OTHER_PUNCTUATION
|| type == Character.MATH_SYMBOL
|| type == Character.CURRENCY_SYMBOL
|| type == Character.MODIFIER_SYMBOL
|| type == Character.OTHER_SYMBOL
|| type == Character.FORMAT
|| type == Character.CONTROL))
break;
boolean is_hira = isHira (c);
boolean is_kata = isKata (c);
boolean is_han = isHan (c);
// Special case Japanese.
// Break in front of a kana or Han character that follows something else.
if (! is_hira && ! is_kata && ! is_han
&& type != Character.NON_SPACING_MARK
&& (isHira (n) || isKata (n) || isHan (n)))
break;
if (is_hira || is_kata || is_han || is_letter)
{
// Now we need to do some lookahead. We might need to do
// quite a bit of lookahead, so we save our position and
// restore it later.
int save = iter.getIndex();
// Skip string of non spacing marks.
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.NON_SPACING_MARK)
n = iter.next();
if (n == CharacterIterator.DONE)
break;
// These breaks leave the index after the skipped marks, not at save.
if ((is_hira && ! isHira (n))
|| (is_kata && ! isHira (n) && ! isKata (n))
|| (is_han && ! isHira (n) && ! isHan (n))
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
|| (is_letter && ! Character.isLetter(n) && n != '\''))
break;
iter.setIndex(save);
}
}
return iter.getIndex();
}
// Moves iter backwards and returns its resulting index.  Returns DONE,
// without moving, when iter is already at its begin index.
public int previous ()
{
int start = iter.getBeginIndex();
if (iter.getIndex() == start)
return DONE;
while (iter.getIndex() >= start)
{
// c is the character just before the old position and n the character
// before c; after the iter.next() below the index is again that of c.
char c = iter.previous();
if (c == CharacterIterator.DONE)
break;
boolean is_hira = isHira (c);
boolean is_kata = isKata (c);
boolean is_han = isHan (c);
boolean is_letter = Character.isLetter(c);
char n = iter.previous();
if (n == CharacterIterator.DONE)
break;
iter.next();
// Note that here type describes n, the earlier character.
int type = Character.getType(n);
// Break after paragraph separators.
if (type == Character.PARAGRAPH_SEPARATOR
|| type == Character.LINE_SEPARATOR)
break;
// Break between letters and non-letters.
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
if (n != '\'' && ! Character.isLetter(n)
&& type != Character.NON_SPACING_MARK
&& is_letter)
break;
// Always break after certain symbols, such as punctuation.
// This heuristic is derived from hints in the JCL book and is
// not part of Unicode. It seems to be right, however.
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
if (n != '\''
&& (type == Character.DASH_PUNCTUATION
|| type == Character.START_PUNCTUATION
|| type == Character.END_PUNCTUATION
|| type == Character.CONNECTOR_PUNCTUATION
|| type == Character.OTHER_PUNCTUATION
|| type == Character.MATH_SYMBOL
|| type == Character.CURRENCY_SYMBOL
|| type == Character.MODIFIER_SYMBOL
|| type == Character.OTHER_SYMBOL
|| type == Character.FORMAT
|| type == Character.CONTROL))
break;
// Special case Japanese.
if ((is_hira || is_kata || is_han)
&& ! isHira (n) && ! isKata (n) && ! isHan (n)
&& type != Character.NON_SPACING_MARK)
break;
// We might have to skip over non spacing marks to see what's
// on the other side.
// The guard is the union of the preconditions of the four tests at the
// bottom of this block.
if (! is_hira || (! is_letter && c != '\''))
{
int save = iter.getIndex();
while (n != CharacterIterator.DONE
&& Character.getType(n) == Character.NON_SPACING_MARK)
n = iter.previous();
// Unlike next(), the index is restored before the tests, so a break
// below leaves it on c.
iter.setIndex(save);
// This is a strange case: a bunch of non-spacing marks at
// the beginning. We treat the current location as a word
// break.
if (n == CharacterIterator.DONE)
break;
if ((isHira (n) && ! is_hira)
|| (isKata (n) && ! is_hira && ! is_kata)
|| (isHan (n) && ! is_hira && ! is_han)
// FIXME: we treat apostrophe as part of a word. This
// is an English-ism.
|| (! is_letter && c != '\'' && Character.isLetter(n)))
break;
}
}
return iter.getIndex();
}
}