1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.fontbox.cmap; 18 19 import java.io.File; 20 import java.io.FileInputStream; 21 import java.io.InputStream; 22 import java.io.IOException; 23 import java.io.PushbackInputStream; 24 25 import java.util.ArrayList; 26 import java.util.HashMap; 27 import java.util.List; 28 import java.util.Map; 29 30 import org.apache.fontbox.util.ResourceLoader; 31 32 /** 33 * This will parser a CMap stream. 34 * 35 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> 36 * @version $Revision: 1.9 $ 37 */ 38 public class CMapParser 39 { 40 private static final String BEGIN_CODESPACE_RANGE = "begincodespacerange"; 41 private static final String BEGIN_BASE_FONT_CHAR = "beginbfchar"; 42 private static final String BEGIN_BASE_FONT_RANGE = "beginbfrange"; 43 private static final String USECMAP = "usecmap"; 44 45 private static final String MARK_END_OF_DICTIONARY = ">>"; 46 private static final String MARK_END_OF_ARRAY = "]"; 47 48 49 private byte[] tokenParserByteBuffer = new byte[512]; 50 51 /** 52 * Creates a new instance of CMapParser. 53 */ 54 public CMapParser() 55 { 56 } 57 58 /** 59 * Parse a CMAP file on the file system. 60 * 61 * @param file The file to parse. 62 * 63 * @return A parsed CMAP file. 64 * 65 * @throws IOException If there is an issue while parsing the CMAP. 66 */ 67 public CMap parse( File file ) throws IOException 68 { 69 String rootDir = file.getParent() + File.separator; 70 FileInputStream input = null; 71 try 72 { 73 input = new FileInputStream( file ); 74 return parse( rootDir, input ); 75 } 76 finally 77 { 78 if( input != null ) 79 { 80 input.close(); 81 } 82 } 83 84 } 85 86 /** 87 * This will parse the stream and create a cmap object. 88 * 89 * @param resourceRoot The root path to the cmap file. This will be used 90 * to find referenced cmap files. It can be null. 91 * @param input The CMAP stream to parse. 92 * 93 * @return The parsed stream as a java object. 94 * 95 * @throws IOException If there is an error parsing the stream. 96 */ 97 public CMap parse( String resourceRoot, InputStream input ) throws IOException 98 { 99 PushbackInputStream cmapStream = new PushbackInputStream( input ); 100 CMap result = new CMap(); 101 Object previousToken = null; 102 Object token = null; 103 while( (token = parseNextToken( cmapStream )) != null ) 104 { 105 if( token instanceof Operator ) 106 { 107 Operator op = (Operator)token; 108 if( op.op.equals( USECMAP ) ) 109 { 110 LiteralName useCmapName = (LiteralName)previousToken; 111 InputStream useStream = ResourceLoader.loadResource( resourceRoot + useCmapName.name ); 112 if( useStream == null ) 113 { 114 throw new IOException( "Error: Could not find referenced cmap stream " + useCmapName.name ); 115 } 116 CMap useCMap = parse( resourceRoot, useStream ); 117 result.useCmap( useCMap ); 118 } 119 else if( op.op.equals( BEGIN_CODESPACE_RANGE ) ) 120 { 121 Number cosCount = (Number)previousToken; 122 for( int j=0; j<cosCount.intValue(); j++ ) 123 { 124 byte[] startRange = (byte[])parseNextToken( cmapStream ); 125 byte[] endRange = (byte[])parseNextToken( cmapStream ); 126 CodespaceRange range = new CodespaceRange(); 127 range.setStart( startRange ); 128 range.setEnd( endRange ); 129 result.addCodespaceRange( range ); 130 } 131 } 132 else if( op.op.equals( BEGIN_BASE_FONT_CHAR ) ) 133 { 134 Number cosCount = (Number)previousToken; 135 for( int j=0; j<cosCount.intValue(); j++ ) 136 { 137 byte[] inputCode = (byte[])parseNextToken( cmapStream ); 138 Object nextToken = parseNextToken( cmapStream ); 139 if( nextToken instanceof byte[] ) 140 { 141 byte[] bytes = (byte[])nextToken; 142 String value = createStringFromBytes( bytes ); 143 result.addMapping( inputCode, value ); 144 } 145 else if( nextToken instanceof LiteralName ) 146 { 147 result.addMapping( inputCode, ((LiteralName)nextToken).name ); 148 } 149 else 150 { 151 throw new IOException( "Error parsing CMap beginbfchar, expected{COSString " + 152 "or COSName} and not " + nextToken ); 153 } 154 } 155 } 156 else if( op.op.equals( BEGIN_BASE_FONT_RANGE ) ) 157 { 158 Number cosCount = (Number)previousToken; 159 160 for( int j=0; j<cosCount.intValue(); j++ ) 161 { 162 byte[] startCode = (byte[])parseNextToken( cmapStream ); 163 byte[] endCode = (byte[])parseNextToken( cmapStream ); 164 Object nextToken = parseNextToken( cmapStream ); 165 List<byte[]> array = null; 166 byte[] tokenBytes = null; 167 if( nextToken instanceof List ) 168 { 169 array = (List)nextToken; 170 tokenBytes = (byte[])array.get( 0 ); 171 } 172 else 173 { 174 tokenBytes = (byte[])nextToken; 175 } 176 177 String value = null; 178 179 int arrayIndex = 0; 180 boolean done = false; 181 while( !done ) 182 { 183 if( compare( startCode, endCode ) >= 0 ) 184 { 185 done = true; 186 } 187 value = createStringFromBytes( tokenBytes ); 188 result.addMapping( startCode, value ); 189 increment( startCode ); 190 191 if( array == null ) 192 { 193 increment( tokenBytes ); 194 } 195 else 196 { 197 arrayIndex++; 198 if( arrayIndex < array.size() ) 199 { 200 tokenBytes = (byte[])array.get( arrayIndex ); 201 } 202 } 203 } 204 } 205 } 206 } 207 previousToken = token; 208 } 209 return result; 210 } 211 212 private Object parseNextToken( PushbackInputStream is ) throws IOException 213 { 214 Object retval = null; 215 int nextByte = is.read(); 216 //skip whitespace 217 while( nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A ) 218 { 219 nextByte = is.read(); 220 } 221 switch( nextByte ) 222 { 223 case '%': 224 { 225 //header operations, for now return the entire line 226 //may need to smarter in the future 227 StringBuffer buffer = new StringBuffer(); 228 buffer.append( (char)nextByte ); 229 readUntilEndOfLine( is, buffer ); 230 retval = buffer.toString(); 231 break; 232 } 233 case '(': 234 { 235 StringBuffer buffer = new StringBuffer(); 236 int stringByte = is.read(); 237 238 while( stringByte != -1 && stringByte != ')' ) 239 { 240 buffer.append( (char)stringByte ); 241 stringByte = is.read(); 242 } 243 retval = buffer.toString(); 244 break; 245 } 246 case '>': 247 { 248 int secondCloseBrace = is.read(); 249 if( secondCloseBrace == '>' ) 250 { 251 retval = MARK_END_OF_DICTIONARY; 252 } 253 else 254 { 255 throw new IOException( "Error: expected the end of a dictionary."); 256 } 257 break; 258 } 259 case ']': 260 { 261 retval = MARK_END_OF_ARRAY; 262 break; 263 } 264 case '[': 265 { 266 List<Object> list = new ArrayList<Object>(); 267 268 Object nextToken = parseNextToken( is ); 269 while( nextToken != null && nextToken != MARK_END_OF_ARRAY ) 270 { 271 list.add( nextToken ); 272 nextToken = parseNextToken( is ); 273 } 274 retval = list; 275 break; 276 } 277 case '<': 278 { 279 int theNextByte = is.read(); 280 if( theNextByte == '<' ) 281 { 282 Map<String,Object> result = new HashMap<String,Object>(); 283 //we are reading a dictionary 284 Object key = parseNextToken( is ); 285 while( key instanceof LiteralName && key != MARK_END_OF_DICTIONARY ) 286 { 287 Object value = parseNextToken( is ); 288 result.put( ((LiteralName)key).name, value ); 289 key = parseNextToken( is ); 290 } 291 retval = result; 292 } 293 else 294 { 295 //won't read more than 512 bytes 296 297 int multiplyer = 16; 298 int bufferIndex = -1; 299 while( theNextByte != -1 && theNextByte != '>' ) 300 { 301 int intValue = 0; 302 if( theNextByte >= '0' && theNextByte <= '9' ) 303 { 304 intValue = theNextByte - '0'; 305 } 306 else if( theNextByte >= 'A' && theNextByte <= 'F' ) 307 { 308 intValue = 10 + theNextByte - 'A'; 309 } 310 else if( theNextByte >= 'a' && theNextByte <= 'f' ) 311 { 312 intValue = 10 + theNextByte - 'a'; 313 } 314 else 315 { 316 throw new IOException( "Error: expected hex character and not " + 317 (char)theNextByte + ":" + theNextByte ); 318 } 319 intValue *= multiplyer; 320 if( multiplyer == 16 ) 321 { 322 bufferIndex++; 323 tokenParserByteBuffer[bufferIndex] = 0; 324 multiplyer = 1; 325 } 326 else 327 { 328 multiplyer = 16; 329 } 330 tokenParserByteBuffer[bufferIndex]+= intValue; 331 theNextByte = is.read(); 332 } 333 byte[] finalResult = new byte[bufferIndex+1]; 334 System.arraycopy(tokenParserByteBuffer,0,finalResult, 0, bufferIndex+1); 335 retval = finalResult; 336 } 337 break; 338 } 339 case '/': 340 { 341 StringBuffer buffer = new StringBuffer(); 342 int stringByte = is.read(); 343 344 while( !isWhitespaceOrEOF( stringByte ) ) 345 { 346 buffer.append( (char)stringByte ); 347 stringByte = is.read(); 348 } 349 retval = new LiteralName( buffer.toString() ); 350 break; 351 } 352 case -1: 353 { 354 //EOF return null; 355 break; 356 } 357 case '0': 358 case '1': 359 case '2': 360 case '3': 361 case '4': 362 case '5': 363 case '6': 364 case '7': 365 case '8': 366 case '9': 367 { 368 StringBuffer buffer = new StringBuffer(); 369 buffer.append( (char)nextByte ); 370 nextByte = is.read(); 371 372 while( !isWhitespaceOrEOF( nextByte ) && 373 (Character.isDigit( (char)nextByte )|| 374 nextByte == '.' ) ) 375 { 376 buffer.append( (char)nextByte ); 377 nextByte = is.read(); 378 } 379 is.unread( nextByte ); 380 String value = buffer.toString(); 381 if( value.indexOf( '.' ) >=0 ) 382 { 383 retval = new Double( value ); 384 } 385 else 386 { 387 retval = new Integer( buffer.toString() ); 388 } 389 break; 390 } 391 default: 392 { 393 StringBuffer buffer = new StringBuffer(); 394 buffer.append( (char)nextByte ); 395 nextByte = is.read(); 396 397 while( !isWhitespaceOrEOF( nextByte ) ) 398 { 399 buffer.append( (char)nextByte ); 400 nextByte = is.read(); 401 } 402 retval = new Operator( buffer.toString() ); 403 404 break; 405 } 406 } 407 return retval; 408 } 409 410 private void readUntilEndOfLine( InputStream is, StringBuffer buf ) throws IOException 411 { 412 int nextByte = is.read(); 413 while( nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A ) 414 { 415 buf.append( (char)nextByte ); 416 nextByte = is.read(); 417 } 418 } 419 420 private boolean isWhitespaceOrEOF( int aByte ) 421 { 422 return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A; 423 } 424 425 426 private void increment( byte[] data ) 427 { 428 increment( data, data.length-1 ); 429 } 430 431 private void increment( byte[] data, int position ) 432 { 433 if( position > 0 && (data[position]+256)%256 == 255 ) 434 { 435 data[position]=0; 436 increment( data, position-1); 437 } 438 else 439 { 440 data[position] = (byte)(data[position]+1); 441 } 442 } 443 444 private String createStringFromBytes( byte[] bytes ) throws IOException 445 { 446 String retval = null; 447 if( bytes.length == 1 ) 448 { 449 retval = new String( bytes ); 450 } 451 else 452 { 453 retval = new String( bytes, "UTF-16BE" ); 454 } 455 return retval; 456 } 457 458 private int compare( byte[] first, byte[] second ) 459 { 460 int retval = 1; 461 boolean done = false; 462 for( int i=0; i<first.length && !done; i++ ) 463 { 464 if( first[i] == second[i] ) 465 { 466 //move to next position 467 } 468 else if( ((first[i]+256)%256) < ((second[i]+256)%256) ) 469 { 470 done = true; 471 retval = -1; 472 } 473 else 474 { 475 done = true; 476 retval = 1; 477 } 478 } 479 return retval; 480 } 481 482 /** 483 * Internal class. 484 */ 485 private class LiteralName 486 { 487 private String name; 488 private LiteralName( String theName ) 489 { 490 name = theName; 491 } 492 } 493 494 /** 495 * Internal class. 496 */ 497 private class Operator 498 { 499 private String op; 500 private Operator( String theOp ) 501 { 502 op = theOp; 503 } 504 } 505 506 /** 507 * A simple class to test parsing of cmap files. 508 * 509 * @param args Some command line arguments. 510 * 511 * @throws Exception If there is an error parsing the file. 512 */ 513 public static void main( String[] args ) throws Exception 514 { 515 if( args.length != 1 ) 516 { 517 System.err.println( "usage: java org.pdfbox.cmapparser.CMapParser <CMAP File>" ); 518 System.exit( -1 ); 519 } 520 CMapParser parser = new CMapParser( ); 521 File cmapFile = new File( args[0] ); 522 CMap result = parser.parse( cmapFile ); 523 System.out.println( "Result:" + result ); 524 } 525 }