/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.impl.io;

import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;

import java.util.Locale;
import org.apache.xerces.util.MessageFormatter;
import org.apache.xerces.impl.msg.XMLMessageFormatter;

/**
 * <p>A UTF-8 reader.</p>
 *
 * <p>Decodes bytes pulled from an {@link InputStream} into UTF-16 code
 * units. One- to four-byte sequences are handled; a four-byte sequence is
 * delivered as a high/low surrogate pair. Malformed input is reported by
 * throwing a <code>MalformedByteSequenceException</code> built with the
 * configured message formatter and locale.</p>
 *
 * @xerces.internal
 *
 * @author Andy Clark, IBM
 *
 * @version $Id: UTF8Reader.java 554069 2007-07-06 21:56:14Z mrglavas $
 */
public class UTF8Reader
    extends Reader {

    //
    // Constants
    //

    /** Default byte buffer size (2048). */
    public static final int DEFAULT_BUFFER_SIZE = 2048;

    // debugging

    /** Debug read. */
    private static final boolean DEBUG_READ = false;

    //
    // Data
    //

    /** Input stream. */
    protected final InputStream fInputStream;

    /** Byte buffer. */
    protected final byte[] fBuffer;

    /**
     * Offset into buffer. A non-zero value is the number of bytes at the
     * start of {@link #fBuffer} that were stashed by a block read which
     * stopped in front of an incomplete or invalid sequence.
     */
    protected int fOffset;

    /** Pending low surrogate still to be delivered, or -1 if none. */
    private int fSurrogate = -1;

    // message formatter; used to produce localized
    // exception messages
    private final MessageFormatter fFormatter;

    // Locale to use for messages
    private final Locale fLocale;

    //
    // Constructors
    //

    /**
     * Constructs a UTF-8 reader from the specified input stream
     * using the default buffer size.  Primarily for testing.
     *
     * @param inputStream The input stream.
     */
    public UTF8Reader(InputStream inputStream) {
        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
    } // <init>(InputStream)

    /**
     * Constructs a UTF-8 reader from the specified input stream
     * using the default buffer size and the given MessageFormatter.
     *
     * @param inputStream The input stream.
     * @param messageFormatter given MessageFormatter
     * @param locale Locale to use for messages
     */
    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
            Locale locale) {
        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
    } // <init>(InputStream, MessageFormatter, Locale)

    /**
     * Constructs a UTF-8 reader from the specified input stream,
     * buffer size and MessageFormatter.
     *
     * @param inputStream The input stream.
     * @param size The initial buffer size.
     * @param messageFormatter the formatter for localizing/formatting errors.
     * @param locale the Locale to use for messages
     */
    public UTF8Reader(InputStream inputStream, int size,
            MessageFormatter messageFormatter, Locale locale) {
        this(inputStream, new byte[size], messageFormatter, locale);
    } // <init>(InputStream, int, MessageFormatter, Locale)

    /**
     * Constructs a UTF-8 reader from the specified input stream,
     * buffer and MessageFormatter.
     *
     * @param inputStream The input stream.
     * @param buffer The byte buffer. It is used directly, not copied.
     * @param messageFormatter the formatter for localizing/formatting errors.
     * @param locale the Locale to use for messages
     */
    public UTF8Reader(InputStream inputStream, byte [] buffer,
            MessageFormatter messageFormatter, Locale locale) {
        fInputStream = inputStream;
        fBuffer = buffer;
        fFormatter = messageFormatter;
        fLocale = locale;
    } // <init>(InputStream, byte[], MessageFormatter, Locale)

    //
    // Reader methods
    //

    /**
     * Read a single character.  This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>A four-byte UTF-8 sequence is returned as two consecutive calls:
     * the high surrogate first, then the low surrogate.</p>
     *
     * @return     The character read, as an integer in the range 0 to 65535
     *             (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
     *             been reached
     *
     * @exception  IOException  If an I/O error occurs, or if the bytes do
     *                          not form a valid UTF-8 sequence
     */
    public int read() throws IOException {

        // decode character
        int c = fSurrogate;
        if (fSurrogate == -1) {
            // NOTE: We use the index into the buffer if there are remaining
            //       bytes from the last block read. -Ac
            int index = 0;

            // get first byte
            int b0 = index == fOffset
                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
            if (b0 == -1) {
                return -1;
            }

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (b0 < 0x80) {
                c = (char)b0;
            }

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            // (lead bytes 0xC0 and 0xC1 are rejected: overlong encodings)
            else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                int b1 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b1 == -1) {
                    expectedByte(2, 2);
                }
                if ((b1 & 0xC0) != 0x80) {
                    invalidByte(2, 2, b1);
                }
                c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]
            else if ((b0 & 0xF0) == 0xE0) {
                int b1 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b1 == -1) {
                    expectedByte(2, 3);
                }
                // reject non-continuation bytes, encoded surrogates
                // (0xED 0xA0..) and overlong forms (0xE0 0x80..0x9F)
                if ((b1 & 0xC0) != 0x80
                    || (b0 == 0xED && b1 >= 0xA0)
                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                    invalidByte(2, 3, b1);
                }
                int b2 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b2 == -1) {
                    expectedByte(3, 3);
                }
                if ((b2 & 0xC0) != 0x80) {
                    invalidByte(3, 3, b2);
                }
                c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                    (b2 & 0x003F);
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            else if ((b0 & 0xF8) == 0xF0) {
                int b1 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b1 == -1) {
                    expectedByte(2, 4);
                }
                // reject non-continuation bytes and overlong forms
                // (0xF0 0x80..0x8F)
                if ((b1 & 0xC0) != 0x80
                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                    invalidByte(2, 4, b1);
                }
                int b2 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b2 == -1) {
                    expectedByte(3, 4);
                }
                if ((b2 & 0xC0) != 0x80) {
                    invalidByte(3, 4, b2);
                }
                int b3 = index == fOffset
                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                if (b3 == -1) {
                    expectedByte(4, 4);
                }
                if ((b3 & 0xC0) != 0x80) {
                    invalidByte(4, 4, b3);
                }
                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                if (uuuuu > 0x10) {
                    invalidSurrogate(uuuuu);
                }
                int wwww = uuuuu - 1;
                int hs = 0xD800 |
                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
                         ((b2 >> 4) & 0x0003);
                int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
                c = hs;
                // the low surrogate is handed out by the next call
                fSurrogate = ls;
            }

            // error
            else {
                invalidByte(1, 1, b0);
            }
        }

        // use surrogate
        else {
            fSurrogate = -1;
        }

        // return character
        if (DEBUG_READ) {
            System.out.println("read(): 0x"+Integer.toHexString(c));
        }
        return c;

    } // read():int

    /**
     * Read characters into a portion of an array.  This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * <p>When a malformed or truncated sequence is found after at least one
     * character has been decoded, the characters decoded so far are
     * returned and the offending bytes are kept in the byte buffer; the
     * error is then raised by the following read.</p>
     *
     * @param      ch     Destination buffer
     * @param      offset Offset at which to start storing characters
     * @param      length Maximum number of characters to read
     *
     * @return     The number of characters read, or -1 if the end of the
     *             stream has been reached
     *
     * @exception  IOException  If an I/O error occurs, or if the bytes do
     *                          not form a valid UTF-8 sequence
     */
    public int read(char ch[], int offset, int length) throws IOException {

        // read bytes
        int out = offset;
        // "count" tracks the number of characters that will be returned;
        // "total" is the number of valid bytes sitting in fBuffer.
        int count = 0;
        final int total;
        if (fOffset == 0) {
            // adjust length to read
            if (length > fBuffer.length) {
                length = fBuffer.length;
            }

            // handle surrogate (only if the caller left room for it)
            if (fSurrogate != -1 && length > 0) {
                ch[out++] = (char)fSurrogate;
                fSurrogate = -1;
                length--;
            }

            // perform read operation
            final int bytesRead = fInputStream.read(fBuffer, 0, length);
            if (bytesRead == -1) {
                // A low surrogate already stored in ch must still be
                // delivered; end of stream is reported by the next call.
                if (out > offset) {
                    return out - offset;
                }
                return -1;
            }
            // Only the bytes actually read may be decoded. Counting the
            // prepended surrogate as a byte would decode stale buffer data.
            total = bytesRead;
            count = bytesRead + (out - offset);
        }

        // skip read; last character was in error
        // NOTE: Having an offset value other than zero means that there was
        //       an error in the last character read. In this case, we have
        //       skipped the read so we don't consume any bytes past the
        //       error. By signalling the error on the next block read we
        //       allow the method to return the most valid characters that
        //       it can on the previous block read. -Ac
        else {
            count = fOffset;
            total = fOffset;
            fOffset = 0;
        }

        // convert bytes to characters
        int in;
        byte byte1;
        final byte byte0 = 0;
        // fast path: leading run of ASCII bytes
        for (in = 0; in < total; in++) {
            byte1 = fBuffer[in];
            if (byte1 >= byte0) {
                ch[out++] = (char)byte1;
            }
            else {
                break;
            }
        }
        for ( ; in < total; in++) {
            byte1 = fBuffer[in];

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (byte1 >= byte0) {
                ch[out++] = (char)byte1;
                continue;
            }

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            int b0 = byte1 & 0x0FF;
            if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    // sequence straddles the buffer end; pull from stream
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fOffset = 1;
                            return out - offset;
                        }
                        expectedByte(2, 2);
                    }
                    count++;
                }
                if ((b1 & 0xC0) != 0x80) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fOffset = 2;
                        return out - offset;
                    }
                    invalidByte(2, 2, b1);
                }
                int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
                ch[out++] = (char)c;
                count -= 1;
                continue;
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]
            if ((b0 & 0xF0) == 0xE0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fOffset = 1;
                            return out - offset;
                        }
                        expectedByte(2, 3);
                    }
                    count++;
                }
                if ((b1 & 0xC0) != 0x80
                    || (b0 == 0xED && b1 >= 0xA0)
                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fOffset = 2;
                        return out - offset;
                    }
                    invalidByte(2, 3, b1);
                }
                int b2 = -1;
                if (++in < total) {
                    b2 = fBuffer[in] & 0x00FF;
                }
                else {
                    b2 = fInputStream.read();
                    if (b2 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fBuffer[1] = (byte)b1;
                            fOffset = 2;
                            return out - offset;
                        }
                        expectedByte(3, 3);
                    }
                    count++;
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fBuffer[2] = (byte)b2;
                        fOffset = 3;
                        return out - offset;
                    }
                    invalidByte(3, 3, b2);
                }
                int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                        (b2 & 0x003F);
                ch[out++] = (char)c;
                count -= 2;
                continue;
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            if ((b0 & 0xF8) == 0xF0) {
                int b1 = -1;
                if (++in < total) {
                    b1 = fBuffer[in] & 0x00FF;
                }
                else {
                    b1 = fInputStream.read();
                    if (b1 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fOffset = 1;
                            return out - offset;
                        }
                        expectedByte(2, 4);
                    }
                    count++;
                }
                if ((b1 & 0xC0) != 0x80
                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fOffset = 2;
                        return out - offset;
                    }
                    invalidByte(2, 4, b1);
                }
                int b2 = -1;
                if (++in < total) {
                    b2 = fBuffer[in] & 0x00FF;
                }
                else {
                    b2 = fInputStream.read();
                    if (b2 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fBuffer[1] = (byte)b1;
                            fOffset = 2;
                            return out - offset;
                        }
                        expectedByte(3, 4);
                    }
                    count++;
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fBuffer[2] = (byte)b2;
                        fOffset = 3;
                        return out - offset;
                    }
                    invalidByte(3, 4, b2);
                }
                int b3 = -1;
                if (++in < total) {
                    b3 = fBuffer[in] & 0x00FF;
                }
                else {
                    b3 = fInputStream.read();
                    if (b3 == -1) {
                        if (out > offset) {
                            fBuffer[0] = (byte)b0;
                            fBuffer[1] = (byte)b1;
                            fBuffer[2] = (byte)b2;
                            fOffset = 3;
                            return out - offset;
                        }
                        expectedByte(4, 4);
                    }
                    count++;
                }
                if ((b3 & 0xC0) != 0x80) {
                    if (out > offset) {
                        fBuffer[0] = (byte)b0;
                        fBuffer[1] = (byte)b1;
                        fBuffer[2] = (byte)b2;
                        fBuffer[3] = (byte)b3;
                        fOffset = 4;
                        return out - offset;
                    }
                    invalidByte(4, 4, b3);
                }

                // decode bytes into surrogate characters
                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                if (uuuuu > 0x10) {
                    invalidSurrogate(uuuuu);
                }
                int wwww = uuuuu - 1;
                int zzzz = b1 & 0x000F;
                int yyyyyy = b2 & 0x003F;
                int xxxxxx = b3 & 0x003F;
                int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
                int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;

                // set characters
                ch[out++] = (char)hs;
                if ((count -= 2) <= length) {
                    ch[out++] = (char)ls;
                }
                // reached the end of the char buffer; save low surrogate for the next read
                else {
                    fSurrogate = ls;
                    --count;
                }
                continue;
            }

            // error
            if (out > offset) {
                fBuffer[0] = (byte)b0;
                fOffset = 1;
                return out - offset;
            }
            invalidByte(1, 1, b0);
        }

        // return number of characters converted
        if (DEBUG_READ) {
            System.out.println("read(char[],"+offset+','+length+"): count="+count);
        }
        return count;

    } // read(char[],int,int)

    /**
     * Skip characters.  This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>Skipping is done by decoding into a scratch array, so malformed
     * input is reported exactly as it would be by a read.</p>
     *
     * @param  n  The number of characters to skip
     *
     * @return    The number of characters actually skipped
     *
     * @exception  IOException  If an I/O error occurs
     */
    public long skip(long n) throws IOException {

        long remaining = n;
        final char[] ch = new char[fBuffer.length];
        do {
            int length = ch.length < remaining ? ch.length : (int)remaining;
            int count = read(ch, 0, length);
            if (count > 0) {
                remaining -= count;
            }
            else {
                break;
            }
        } while (remaining > 0);

        long skipped = n - remaining;
        return skipped;

    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     *
     * <p>This implementation always returns false.</p>
     *
     * @return True if the next read() is guaranteed not to block for input,
     * false otherwise.  Note that returning false does not guarantee that the
     * next read will block.
     *
     * @exception  IOException  If an I/O error occurs
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     *
     * @return Always false.
     */
    public boolean markSupported() {
        return false;
    } // markSupported()

    /**
     * Mark the present position in the stream.  This reader does not
     * support marking, so this method always throws.
     *
     * @param  readAheadLimit  Ignored.
     *
     * @exception  IOException  Always, since mark() is not supported
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
    } // mark(int)

    /**
     * Reset the decoder state.  Any bytes stashed in the byte buffer and
     * any pending low surrogate are discarded.  The underlying input stream
     * is not touched by this method.
     *
     * @exception  IOException  Declared for compatibility with
     *                          {@link Reader#reset()}; not thrown here
     */
    public void reset() throws IOException {
        fOffset = 0;
        fSurrogate = -1;
    } // reset()

    /**
     * Close the stream by closing the underlying input stream.
     *
     * @exception  IOException  If an I/O error occurs
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Throws an exception for expected byte.
     *
     * @param position 1-based position of the missing byte in the sequence
     * @param count    total number of bytes in the sequence
     */
    private void expectedByte(int position, int count)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "ExpectedByte",
            new Object[] {Integer.toString(position), Integer.toString(count)});

    } // expectedByte(int,int)

    /**
     * Throws an exception for invalid byte.
     *
     * @param position 1-based position of the bad byte in the sequence
     * @param count    total number of bytes in the sequence
     * @param c        the offending byte value; not included in the message
     */
    private void invalidByte(int position, int count, int c)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    } // invalidByte(int,int,int)

    /**
     * Throws an exception for invalid surrogate bits.
     *
     * @param uuuuu the five "uuuuu" bits of the four-byte sequence,
     *              reported in hexadecimal
     */
    private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidHighSurrogate",
            new Object[] {Integer.toHexString(uuuuu)});

    } // invalidSurrogate(int)

} // class UTF8Reader