Constructor: |
public UTF8Reader(InputStream inputStream) {
//
// Constructors
//
this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
}
Constructs a UTF-8 reader from the specified input stream
using the default buffer size. Primarily for testing. Parameters:
inputStream - The input stream.
|
public UTF8Reader(InputStream inputStream,
MessageFormatter messageFormatter,
Locale locale) {
this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
}
Constructs a UTF-8 reader from the specified input stream
using the default buffer size and the given MessageFormatter. Parameters:
inputStream - The input stream.
messageFormatter - given MessageFormatter
locale - Locale to use for messages
|
public UTF8Reader(InputStream inputStream,
int size,
MessageFormatter messageFormatter,
Locale locale) {
this(inputStream, new byte[size], messageFormatter, locale);
}
Constructs a UTF-8 reader from the specified input stream,
buffer size and MessageFormatter. Parameters:
inputStream - The input stream.
size - The initial buffer size.
messageFormatter - the formatter for localizing/formatting errors.
locale - the Locale to use for messages
|
public UTF8Reader(InputStream inputStream,
byte[] buffer,
MessageFormatter messageFormatter,
Locale locale) {
fInputStream = inputStream;
fBuffer = buffer;
fFormatter = messageFormatter;
fLocale = locale;
}
Constructs a UTF-8 reader from the specified input stream,
buffer and MessageFormatter. Parameters:
inputStream - The input stream.
buffer - The byte buffer.
messageFormatter - the formatter for localizing/formatting errors.
locale - the Locale to use for messages
|
Method from org.apache.xerces.impl.io.UTF8Reader Detail: |
public void close() throws IOException {
fInputStream.close();
}
Close the stream. Once a stream has been closed, further read(),
ready(), mark(), or reset() invocations will throw an IOException.
Closing a previously-closed stream, however, has no effect. |
public void mark(int readAheadLimit) throws IOException {
throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
}
Mark the present position in the stream. Subsequent calls to reset()
will attempt to reposition the stream to this point. Not all
character-input streams support the mark() operation. |
public boolean markSupported() {
return false;
}
Tell whether this stream supports the mark() operation. |
public int read() throws IOException {
// decode character
int c = fSurrogate;
if (fSurrogate == -1) {
// NOTE: We use the index into the buffer if there are remaining
// bytes from the last block read. -Ac
int index = 0;
// get first byte
int b0 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b0 == -1) {
return -1;
}
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
if (b0 < 0x80) {
c = (char)b0;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 2);
}
if ((b1 & 0xC0) != 0x80) {
invalidByte(2, 2, b1);
}
c = ((b0 < < 6) & 0x07C0) | (b1 & 0x003F);
}
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
// Unicode: [zzzz yyyy] [yyxx xxxx]
else if ((b0 & 0xF0) == 0xE0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 3);
}
if ((b1 & 0xC0) != 0x80
|| (b0 == 0xED && b1 >= 0xA0)
|| ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
invalidByte(2, 3, b1);
}
int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 3);
}
if ((b2 & 0xC0) != 0x80) {
invalidByte(3, 3, b2);
}
c = ((b0 < < 12) & 0xF000) | ((b1 < < 6) & 0x0FC0) |
(b2 & 0x003F);
}
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
else if ((b0 & 0xF8) == 0xF0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 4);
}
if ((b1 & 0xC0) != 0x80
|| ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
invalidByte(2, 3, b1);
}
int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 4);
}
if ((b2 & 0xC0) != 0x80) {
invalidByte(3, 3, b2);
}
int b3 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b3 == -1) {
expectedByte(4, 4);
}
if ((b3 & 0xC0) != 0x80) {
invalidByte(4, 4, b3);
}
int uuuuu = ((b0 < < 2) & 0x001C) | ((b1 > > 4) & 0x0003);
if (uuuuu > 0x10) {
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
int hs = 0xD800 |
((wwww < < 6) & 0x03C0) | ((b1 < < 2) & 0x003C) |
((b2 > > 4) & 0x0003);
int ls = 0xDC00 | ((b2 < < 6) & 0x03C0) | (b3 & 0x003F);
c = hs;
fSurrogate = ls;
}
// error
else {
invalidByte(1, 1, b0);
}
}
// use surrogate
else {
fSurrogate = -1;
}
// return character
if (DEBUG_READ) {
System.out.println("read(): 0x"+Integer.toHexString(c));
}
return c;
}
|
public int read(char[] ch,
int offset,
int length) throws IOException {
// read bytes
int out = offset;
int count = 0;
if (fOffset == 0) {
// adjust length to read
if (length > fBuffer.length) {
length = fBuffer.length;
}
// handle surrogate
if (fSurrogate != -1) {
ch[out++] = (char)fSurrogate;
fSurrogate = -1;
length--;
}
// perform read operation
count = fInputStream.read(fBuffer, 0, length);
if (count == -1) {
return -1;
}
count += out - offset;
}
// skip read; last character was in error
// NOTE: Having an offset value other than zero means that there was
// an error in the last character read. In this case, we have
// skipped the read so we don't consume any bytes past the
// error. By signalling the error on the next block read we
// allow the method to return the most valid characters that
// it can on the previous block read. -Ac
else {
count = fOffset;
fOffset = 0;
}
// convert bytes to characters
final int total = count;
int in;
byte byte1;
final byte byte0 = 0;
for (in = 0; in < total; in++) {
byte1 = fBuffer[in];
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
}
else {
break;
}
}
for ( ; in < total; in++) {
byte1 = fBuffer[in];
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
continue;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
int b0 = byte1 & 0x0FF;
if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 2);
}
count++;
}
if ((b1 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 2, b1);
}
int c = ((b0 < < 6) & 0x07C0) | (b1 & 0x003F);
ch[out++] = (char)c;
count -= 1;
continue;
}
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
// Unicode: [zzzz yyyy] [yyxx xxxx]
if ((b0 & 0xF0) == 0xE0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 3);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| (b0 == 0xED && b1 >= 0xA0)
|| ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 3, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 3);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 3, b2);
}
int c = ((b0 < < 12) & 0xF000) | ((b1 < < 6) & 0x0FC0) |
(b2 & 0x003F);
ch[out++] = (char)c;
count -= 2;
continue;
}
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
if ((b0 & 0xF8) == 0xF0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 4);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 4, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 4);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 4, b2);
}
int b3 = -1;
if (++in < total) {
b3 = fBuffer[in] & 0x00FF;
}
else {
b3 = fInputStream.read();
if (b3 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
expectedByte(4, 4);
}
count++;
}
if ((b3 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fBuffer[3] = (byte)b3;
fOffset = 4;
return out - offset;
}
invalidByte(4, 4, b2);
}
// decode bytes into surrogate characters
int uuuuu = ((b0 < < 2) & 0x001C) | ((b1 > > 4) & 0x0003);
if (uuuuu > 0x10) {
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
int zzzz = b1 & 0x000F;
int yyyyyy = b2 & 0x003F;
int xxxxxx = b3 & 0x003F;
int hs = 0xD800 | ((wwww < < 6) & 0x03C0) | (zzzz < < 2) | (yyyyyy > > 4);
int ls = 0xDC00 | ((yyyyyy < < 6) & 0x03C0) | xxxxxx;
// set characters
ch[out++] = (char)hs;
if ((count -= 2) < = length) {
ch[out++] = (char)ls;
}
// reached the end of the char buffer; save low surrogate for the next read
else {
fSurrogate = ls;
--count;
}
continue;
}
// error
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
invalidByte(1, 1, b0);
}
// return number of characters converted
if (DEBUG_READ) {
System.out.println("read(char[],"+offset+',"+length+"): count="+count);
}
return count;
}
Read characters into a portion of an array. This method will block
until some input is available, an I/O error occurs, or the end of the
stream is reached. |
public boolean ready() throws IOException {
return false;
}
Tell whether this stream is ready to be read. |
public void reset() throws IOException {
fOffset = 0;
fSurrogate = -1;
}
Reset the stream. If the stream has been marked, then attempt to
reposition it at the mark. If the stream has not been marked, then
attempt to reset it in some way appropriate to the particular stream,
for example by repositioning it to its starting point. Not all
character-input streams support the reset() operation, and some support
reset() without supporting mark(). |
public long skip(long n) throws IOException {
long remaining = n;
final char[] ch = new char[fBuffer.length];
do {
int length = ch.length < remaining ? ch.length : (int)remaining;
int count = read(ch, 0, length);
if (count > 0) {
remaining -= count;
}
else {
break;
}
} while (remaining > 0);
long skipped = n - remaining;
return skipped;
}
Skip characters. This method will block until some characters are
available, an I/O error occurs, or the end of the stream is reached. |