/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.impl.io;

import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;

/**
 * Reader for UCS-2 and UCS-4 encodings.
 * (i.e., encodings from ISO-10646-UCS-(2|4)).
 * <p>
 * Bytes are pulled from the wrapped input stream and assembled into
 * characters of two (UCS-2) or four (UCS-4) bytes in the byte order
 * selected by the encoding constant given at construction time.
 *
 * @xerces.internal
 *
 * @author Neil Graham, IBM
 *
 * @version $Id: UCSReader.java 449317 2006-09-23 22:12:30Z mrglavas $
 */
public class UCSReader extends Reader {

    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader
     * since it's reasonable to surmise that the average UCS-4-encoded
     * file should be 4 times as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /** Encoding constant: UCS-2, little-endian. */
    public static final short UCS2LE = 1;

    /** Encoding constant: UCS-2, big-endian. */
    public static final short UCS2BE = 2;

    /** Encoding constant: UCS-4, little-endian. */
    public static final short UCS4LE = 4;

    /** Encoding constant: UCS-4, big-endian. */
    public static final short UCS4BE = 8;

    //
    // Data
    //

    /** Input stream. */
    protected final InputStream fInputStream;

    /** Byte buffer. */
    protected final byte[] fBuffer;

    /**
     * What kind of data we're dealing with: one of UCS2LE, UCS2BE,
     * UCS4LE or UCS4BE. Any value &gt;= 4 is treated as UCS-4.
     */
    protected final short fEncoding;

    //
    // Constructors
    //

    /**
     * Constructs a UCS reader from the specified input stream
     * using the default buffer size. The endianness and whether this is
     * UCS-2 or UCS-4 also need to be known in advance.
     *
     * @param inputStream The input stream.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs a UCS reader from the specified input stream
     * and buffer size. The endianness and whether this is
     * UCS-2 or UCS-4 also need to be known in advance.
     *
     * @param inputStream The input stream.
     * @param size The initial buffer size, in bytes.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        this(inputStream, new byte[size], encoding);
    } // <init>(InputStream,int,short)

    /**
     * Constructs a UCS reader from the specified input stream
     * and buffer. The endianness and whether this is
     * UCS-2 or UCS-4 also need to be known in advance.
     *
     * @param inputStream The input stream.
     * @param buffer The byte buffer. It must be able to hold at least one
     *               encoded character (2 bytes for UCS-2, 4 for UCS-4) for
     *               the bulk read method to work.
     * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, byte [] buffer, short encoding) {
        fInputStream = inputStream;
        fBuffer = buffer;
        fEncoding = encoding;
    } // <init>(InputStream,byte[],short)

    //
    // Reader methods
    //

    /**
     * Read a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * The bytes are taken directly from the underlying stream, not from
     * the byte buffer. For UCS-4 the value returned is the full 32-bit
     * value assembled from the four bytes, so it may lie outside the
     * 16-bit <code>char</code> range.
     *
     * @return The character read, as an integer, or -1 if the end of the
     *         stream has been reached (including when the stream ends in
     *         the middle of a character)
     *
     * @exception IOException If an I/O error occurs
     */
    public int read() throws IOException {
        // End-of-stream has to be detected on the raw return value. Masking
        // first and comparing against 0xff would make every genuine 0xFF
        // data byte look like the end of the stream.
        final int b0 = fInputStream.read();
        if (b0 == -1) {
            return -1;
        }
        final int b1 = fInputStream.read();
        if (b1 == -1) {
            return -1;
        }
        // UCS-4
        if (fEncoding >= 4) {
            final int b2 = fInputStream.read();
            if (b2 == -1) {
                return -1;
            }
            final int b3 = fInputStream.read();
            if (b3 == -1) {
                return -1;
            }
            if (fEncoding == UCS4BE) {
                return (b0<<24)+(b1<<16)+(b2<<8)+b3;
            }
            return (b3<<24)+(b2<<16)+(b1<<8)+b0;
        }
        // UCS-2
        if (fEncoding == UCS2BE) {
            return (b0<<8)+b1;
        }
        return (b1<<8)+b0;
    } // read():int

    /**
     * Read characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     * <p>
     * At most as many characters as fit in the byte buffer are read per
     * call. If the underlying stream delivers only part of a character,
     * the remaining bytes are read one at a time; if the stream ends in the
     * middle of a character the missing bytes are treated as zero.
     * <p>
     * Note that UCS-4 values are narrowed to <code>char</code>, so values
     * above 0xFFFF keep only their low 16 bits; they are not converted to
     * surrogate pairs.
     *
     * @param ch     Destination buffer
     * @param offset Offset at which to start storing characters
     * @param length Maximum number of characters to read
     *
     * @return The number of characters read, or -1 if the end of the
     *         stream has been reached
     *
     * @exception IOException If an I/O error occurs, or if the byte buffer
     *                        is too small to hold a single character
     * @exception IndexOutOfBoundsException If <code>offset</code> or
     *            <code>length</code> is negative, or <code>length</code>
     *            is greater than <code>ch.length - offset</code>
     */
    public int read(char ch[], int offset, int length) throws IOException {
        if (offset < 0 || length < 0 || length > ch.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        final int shift = charShift();
        final int charSize = 1 << shift;

        // Work in whole characters: this keeps the byte request a multiple
        // of the character width (so completing a partial character can
        // never run past the end of the buffer) and avoids overflowing
        // "length << shift" for very large lengths.
        final int capacity = fBuffer.length >> shift;
        if (capacity == 0 && length > 0) {
            throw new IOException("UCSReader byte buffer is too small to hold one character");
        }
        final int byteLength = Math.min(length, capacity) << shift;

        int count = fInputStream.read(fBuffer, 0, byteLength);
        if (count == -1) {
            return -1;
        }

        // Make count a multiple of the character width by fetching the
        // bytes still missing from the last, partially read character.
        final int missing = (charSize - (count & (charSize - 1))) & (charSize - 1);
        boolean endOfInput = false;
        for (int i = 0; i < missing; i++) {
            int next = endOfInput ? -1 : fInputStream.read();
            if (next == -1) {
                // End of input; something likely went wrong!
                // Pad the rest of the character with nulls.
                endOfInput = true;
                next = 0;
            }
            fBuffer[count + i] = (byte) next;
        }
        count += missing;

        // now count is a multiple of the right number of bytes
        final int numChars = count >> shift;
        int curPos = 0;
        for (int i = 0; i < numChars; i++) {
            final int b0 = fBuffer[curPos++] & 0xff;
            final int b1 = fBuffer[curPos++] & 0xff;
            if (fEncoding >= 4) { // UCS-4
                final int b2 = fBuffer[curPos++] & 0xff;
                final int b3 = fBuffer[curPos++] & 0xff;
                if (fEncoding == UCS4BE) {
                    ch[offset+i] = (char)((b0<<24)+(b1<<16)+(b2<<8)+b3);
                }
                else {
                    ch[offset+i] = (char)((b3<<24)+(b2<<16)+(b1<<8)+b0);
                }
            }
            else { // UCS-2
                if (fEncoding == UCS2BE) {
                    ch[offset+i] = (char)((b0<<8)+b1);
                }
                else {
                    ch[offset+i] = (char)((b1<<8)+b0);
                }
            }
        }
        return numChars;
    } // read(char[],int,int)

    /**
     * Skip characters. This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * The underlying stream is free to skip fewer bytes than requested. If
     * it stops in the middle of a character, the rest of that character is
     * consumed as well so that subsequent reads stay aligned on character
     * boundaries; that character is counted as skipped.
     *
     * @param n The number of characters to skip
     *
     * @return The number of characters actually skipped
     *
     * @exception IOException If an I/O error occurs
     */
    public long skip(long n) throws IOException {
        // shift is the number of bits to move n leftward to get the number
        // of bytes to skip, and to move the result rightward to get the
        // number of characters effectively skipped. Shifting and masking
        // are used, as elsewhere in this code, instead of * and /.
        final int shift = charShift();
        final int charSize = 1 << shift;

        // Clamp the request so that converting it to bytes cannot overflow.
        final long limit = Long.MAX_VALUE >> shift;
        final long chars = Math.max(-limit, Math.min(n, limit));

        final long bytesSkipped = fInputStream.skip(chars << shift);
        final int partial = (int) (bytesSkipped & (charSize - 1));
        if (partial == 0) {
            return bytesSkipped >> shift;
        }
        // The stream stopped inside a character: swallow its remaining
        // bytes (or as many as exist before end of input) to realign.
        for (int i = partial; i < charSize; i++) {
            if (fInputStream.read() == -1) {
                break;
            }
        }
        return (bytesSkipped >> shift) + 1;
    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     * <p>
     * This implementation never inspects the underlying stream and always
     * answers false.
     *
     * @return True if the next read() is guaranteed not to block for input,
     * false otherwise. Note that returning false does not guarantee that the
     * next read will block.
     *
     * @exception IOException If an I/O error occurs
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     *
     * @return Whatever the underlying input stream reports.
     */
    public boolean markSupported() {
        return fInputStream.markSupported();
    } // markSupported()

    /**
     * Mark the present position in the stream. Subsequent calls to reset()
     * will attempt to reposition the stream to this point. Not all
     * character-input streams support the mark() operation.
     * <p>
     * The limit is expressed in characters, whereas the underlying stream
     * counts bytes, so it is scaled by the number of bytes per character
     * (saturating at <code>Integer.MAX_VALUE</code>) before being passed on.
     *
     * @param readAheadLimit Limit on the number of characters that may be
     *                       read while still preserving the mark. After
     *                       reading this many characters, attempting to
     *                       reset the stream may fail.
     *
     * @exception IOException If the stream does not support mark(),
     *                        or if some other I/O error occurs
     */
    public void mark(int readAheadLimit) throws IOException {
        final int shift = charShift();
        final int byteLimit;
        if (readAheadLimit <= 0) {
            // Nothing to scale; hand the value to the stream unchanged.
            byteLimit = readAheadLimit;
        }
        else if (readAheadLimit > (Integer.MAX_VALUE >> shift)) {
            byteLimit = Integer.MAX_VALUE;
        }
        else {
            byteLimit = readAheadLimit << shift;
        }
        fInputStream.mark(byteLimit);
    } // mark(int)

    /**
     * Reset the stream. If the stream has been marked, then attempt to
     * reposition it at the mark. If the stream has not been marked, then
     * attempt to reset it in some way appropriate to the particular stream,
     * for example by repositioning it to its starting point. Not all
     * character-input streams support the reset() operation, and some support
     * reset() without supporting mark().
     *
     * @exception IOException If the stream has not been marked,
     *                        or if the mark has been invalidated,
     *                        or if the stream does not support reset(),
     *                        or if some other I/O error occurs
     */
    public void reset() throws IOException {
        fInputStream.reset();
    } // reset()

    /**
     * Close the stream. Once a stream has been closed, further read(),
     * ready(), mark(), or reset() invocations will throw an IOException.
     * Closing a previously-closed stream, however, has no effect.
     *
     * @exception IOException If an I/O error occurs
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Returns the base-2 logarithm of the number of bytes per character:
     * 2 for the UCS-4 encodings (4 bytes), 1 for the UCS-2 encodings
     * (2 bytes).
     */
    private int charShift() {
        return (fEncoding >= 4) ? 2 : 1;
    } // charShift():int

} // class UCSReader