Home » pdfbox-1.1.0-src » org.apache.fontbox.cmap » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.fontbox.cmap;
   18   
   19   import java.io.File;
   20   import java.io.FileInputStream;
   21   import java.io.InputStream;
   22   import java.io.IOException;
   23   import java.io.PushbackInputStream;
   24   
   25   import java.util.ArrayList;
   26   import java.util.HashMap;
   27   import java.util.List;
   28   import java.util.Map;
   29   
   30   import org.apache.fontbox.util.ResourceLoader;
   31   
   32   /**
   33    * This will parser a CMap stream.
   34    *
   35    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
   36    * @version $Revision: 1.9 $
   37    */
   38   public class CMapParser
   39   {
   40       private static final String BEGIN_CODESPACE_RANGE = "begincodespacerange";
   41       private static final String BEGIN_BASE_FONT_CHAR = "beginbfchar";
   42       private static final String BEGIN_BASE_FONT_RANGE = "beginbfrange";
   43       private static final String USECMAP = "usecmap";
   44       
   45       private static final String MARK_END_OF_DICTIONARY = ">>";
   46       private static final String MARK_END_OF_ARRAY = "]";
   47       
   48       
   49       private byte[] tokenParserByteBuffer = new byte[512];
   50   
   51       /**
   52        * Creates a new instance of CMapParser.
   53        */
   54       public CMapParser()
   55       {
   56       }
   57       
   58       /**
   59        * Parse a CMAP file on the file system.
   60        * 
   61        * @param file The file to parse.
   62        * 
   63        * @return A parsed CMAP file.
   64        * 
   65        * @throws IOException If there is an issue while parsing the CMAP.
   66        */
   67       public CMap parse( File file ) throws IOException
   68       {
   69           String rootDir = file.getParent() + File.separator;
   70           FileInputStream input = null;
   71           try
   72           {
   73               input = new FileInputStream( file );
   74               return parse( rootDir, input );
   75           }
   76           finally
   77           {
   78               if( input != null )
   79               {
   80                   input.close();
   81               }
   82           }
   83           
   84       }
   85   
   86       /**
   87        * This will parse the stream and create a cmap object.
   88        *
   89        * @param resourceRoot The root path to the cmap file.  This will be used
   90        *                     to find referenced cmap files.  It can be null.
   91        * @param input The CMAP stream to parse.
   92        * 
   93        * @return The parsed stream as a java object.
   94        *
   95        * @throws IOException If there is an error parsing the stream.
   96        */
   97       public CMap parse( String resourceRoot, InputStream input ) throws IOException
   98       {
   99           PushbackInputStream cmapStream = new PushbackInputStream( input );
  100           CMap result = new CMap();
  101           Object previousToken = null;
  102           Object token = null;
  103           while( (token = parseNextToken( cmapStream )) != null )
  104           {
  105               if( token instanceof Operator )
  106               {
  107                   Operator op = (Operator)token;
  108                   if( op.op.equals( USECMAP ) )
  109                   {
  110                       LiteralName useCmapName = (LiteralName)previousToken;
  111                       InputStream useStream = ResourceLoader.loadResource( resourceRoot + useCmapName.name );
  112                       if( useStream == null )
  113                       {
  114                           throw new IOException( "Error: Could not find referenced cmap stream " + useCmapName.name );
  115                       }
  116                       CMap useCMap = parse( resourceRoot, useStream );
  117                       result.useCmap( useCMap );
  118                   }
  119                   else if( op.op.equals( BEGIN_CODESPACE_RANGE ) )
  120                   {
  121                       Number cosCount = (Number)previousToken;
  122                       for( int j=0; j<cosCount.intValue(); j++ )
  123                       {
  124                           byte[] startRange = (byte[])parseNextToken( cmapStream );
  125                           byte[] endRange = (byte[])parseNextToken( cmapStream );
  126                           CodespaceRange range = new CodespaceRange();
  127                           range.setStart( startRange );
  128                           range.setEnd( endRange );
  129                           result.addCodespaceRange( range );
  130                       }
  131                   }
  132                   else if( op.op.equals( BEGIN_BASE_FONT_CHAR ) )
  133                   {
  134                       Number cosCount = (Number)previousToken;
  135                       for( int j=0; j<cosCount.intValue(); j++ )
  136                       {
  137                           byte[] inputCode = (byte[])parseNextToken( cmapStream );
  138                           Object nextToken = parseNextToken( cmapStream );
  139                           if( nextToken instanceof byte[] )
  140                           {
  141                               byte[] bytes = (byte[])nextToken;
  142                               String value = createStringFromBytes( bytes );
  143                               result.addMapping( inputCode, value );
  144                           }
  145                           else if( nextToken instanceof LiteralName )
  146                           {
  147                               result.addMapping( inputCode, ((LiteralName)nextToken).name );
  148                           }
  149                           else
  150                           {
  151                               throw new IOException( "Error parsing CMap beginbfchar, expected{COSString " +
  152                                                      "or COSName} and not " + nextToken );
  153                           }
  154                       }
  155                   }
  156                  else if( op.op.equals( BEGIN_BASE_FONT_RANGE ) )
  157                   {
  158                       Number cosCount = (Number)previousToken;
  159                       
  160                       for( int j=0; j<cosCount.intValue(); j++ )
  161                       {
  162                           byte[] startCode = (byte[])parseNextToken( cmapStream );
  163                           byte[] endCode = (byte[])parseNextToken( cmapStream );
  164                           Object nextToken = parseNextToken( cmapStream );
  165                           List<byte[]> array = null;
  166                           byte[] tokenBytes = null;
  167                           if( nextToken instanceof List )
  168                           {
  169                               array = (List)nextToken;
  170                               tokenBytes = (byte[])array.get( 0 );
  171                           }
  172                           else
  173                           {
  174                               tokenBytes = (byte[])nextToken;
  175                           }
  176                           
  177                           String value = null;
  178                           
  179                           int arrayIndex = 0;
  180                           boolean done = false;
  181                           while( !done )
  182                           {
  183                               if( compare( startCode, endCode ) >= 0 )
  184                               {
  185                                   done = true;
  186                               }
  187                               value = createStringFromBytes( tokenBytes );
  188                               result.addMapping( startCode, value );
  189                               increment( startCode );
  190                               
  191                               if( array == null )
  192                               {
  193                                   increment( tokenBytes );
  194                               }
  195                               else
  196                               {
  197                                   arrayIndex++;
  198                                   if( arrayIndex < array.size() )
  199                                   {
  200                                       tokenBytes = (byte[])array.get( arrayIndex );
  201                                   }
  202                               }
  203                           }
  204                       }
  205                   }
  206               }
  207               previousToken = token;
  208           }
  209           return result;
  210       }
  211       
  212       private Object parseNextToken( PushbackInputStream is ) throws IOException
  213       {
  214           Object retval = null;
  215           int nextByte = is.read();
  216           //skip whitespace
  217           while( nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A )
  218           {
  219               nextByte = is.read();
  220           }
  221           switch( nextByte )
  222           {
  223               case '%':
  224               {
  225                   //header operations, for now return the entire line 
  226                   //may need to smarter in the future
  227                   StringBuffer buffer = new StringBuffer();
  228                   buffer.append( (char)nextByte );
  229                   readUntilEndOfLine( is, buffer );
  230                   retval = buffer.toString();
  231                   break;
  232               }
  233               case '(':
  234               {
  235                   StringBuffer buffer = new StringBuffer();
  236                   int stringByte = is.read();
  237                   
  238                   while( stringByte != -1 && stringByte != ')' )
  239                   {
  240                       buffer.append( (char)stringByte );
  241                       stringByte = is.read();
  242                   }
  243                   retval = buffer.toString();
  244                   break;
  245               }
  246               case '>':
  247               {
  248                   int secondCloseBrace = is.read();
  249                   if( secondCloseBrace == '>' )
  250                   {
  251                       retval = MARK_END_OF_DICTIONARY;
  252                   }
  253                   else
  254                   {
  255                       throw new IOException( "Error: expected the end of a dictionary.");
  256                   }
  257                   break;
  258               }
  259               case ']':
  260               {
  261                   retval = MARK_END_OF_ARRAY;
  262                   break;
  263               }
  264               case '[':
  265               {
  266                   List<Object> list = new ArrayList<Object>();
  267                   
  268                   Object nextToken = parseNextToken( is ); 
  269                   while( nextToken != null && nextToken != MARK_END_OF_ARRAY )
  270                   {
  271                       list.add( nextToken );
  272                       nextToken = parseNextToken( is );
  273                   }
  274                   retval = list;
  275                   break;
  276               }
  277               case '<':
  278               {
  279                   int theNextByte = is.read();
  280                   if( theNextByte == '<' )
  281                   {
  282                       Map<String,Object> result = new HashMap<String,Object>();
  283                       //we are reading a dictionary
  284                       Object key = parseNextToken( is ); 
  285                       while( key instanceof LiteralName && key != MARK_END_OF_DICTIONARY )
  286                       {
  287                           Object value = parseNextToken( is );
  288                           result.put( ((LiteralName)key).name, value );
  289                           key = parseNextToken( is );
  290                       }
  291                       retval = result;
  292                   }
  293                   else
  294                   {
  295                       //won't read more than 512 bytes
  296                       
  297                       int multiplyer = 16;
  298                       int bufferIndex = -1;
  299                       while( theNextByte != -1 && theNextByte != '>' )
  300                       {
  301                           int intValue = 0;
  302                           if( theNextByte >= '0' && theNextByte <= '9' )
  303                           {
  304                               intValue = theNextByte - '0';
  305                           }
  306                           else if( theNextByte >= 'A' && theNextByte <= 'F' )
  307                           {
  308                               intValue = 10 + theNextByte - 'A';
  309                           }
  310                           else if( theNextByte >= 'a' && theNextByte <= 'f' )
  311                           {
  312                               intValue = 10 + theNextByte - 'a';
  313                           }
  314                           else
  315                           {
  316                               throw new IOException( "Error: expected hex character and not " + 
  317                                   (char)theNextByte + ":" + theNextByte );
  318                           }
  319                           intValue *= multiplyer;
  320                           if( multiplyer == 16 )
  321                           {
  322                               bufferIndex++;
  323                               tokenParserByteBuffer[bufferIndex] = 0;
  324                               multiplyer = 1;
  325                           }
  326                           else
  327                           {
  328                               multiplyer = 16;
  329                           }
  330                           tokenParserByteBuffer[bufferIndex]+= intValue;
  331                           theNextByte = is.read();
  332                       }
  333                       byte[] finalResult = new byte[bufferIndex+1];
  334                       System.arraycopy(tokenParserByteBuffer,0,finalResult, 0, bufferIndex+1);
  335                       retval = finalResult;
  336                   }
  337                   break;
  338               }
  339               case '/':
  340               {
  341                   StringBuffer buffer = new StringBuffer();
  342                   int stringByte = is.read();
  343                   
  344                   while( !isWhitespaceOrEOF( stringByte ) )
  345                   {
  346                       buffer.append( (char)stringByte );
  347                       stringByte = is.read();
  348                   }
  349                   retval = new LiteralName( buffer.toString() );
  350                   break;
  351               }
  352               case -1:
  353               {
  354                   //EOF return null;
  355                   break;
  356               }
  357               case '0':
  358               case '1':
  359               case '2':
  360               case '3':
  361               case '4':
  362               case '5':
  363               case '6':
  364               case '7':
  365               case '8':
  366               case '9':
  367               {
  368                   StringBuffer buffer = new StringBuffer();
  369                   buffer.append( (char)nextByte );
  370                   nextByte = is.read();
  371                   
  372                   while( !isWhitespaceOrEOF( nextByte ) &&
  373                           (Character.isDigit( (char)nextByte )||
  374                            nextByte == '.' ) )
  375                   {
  376                       buffer.append( (char)nextByte );
  377                       nextByte = is.read();
  378                   }
  379                   is.unread( nextByte );
  380                   String value = buffer.toString();
  381                   if( value.indexOf( '.' ) >=0 )
  382                   {
  383                       retval = new Double( value );
  384                   }
  385                   else
  386                   {
  387                       retval = new Integer( buffer.toString() );
  388                   }
  389                   break;
  390               }
  391               default:
  392               {
  393                   StringBuffer buffer = new StringBuffer();
  394                   buffer.append( (char)nextByte );
  395                   nextByte = is.read();
  396                   
  397                   while( !isWhitespaceOrEOF( nextByte ) )
  398                   {
  399                       buffer.append( (char)nextByte );
  400                       nextByte = is.read();
  401                   }
  402                   retval = new Operator( buffer.toString() );                        
  403                   
  404                   break;
  405               }
  406           }
  407           return retval;
  408       }
  409       
  410       private void readUntilEndOfLine( InputStream is, StringBuffer buf ) throws IOException
  411       {
  412           int nextByte = is.read();
  413           while( nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A )
  414           {
  415               buf.append( (char)nextByte );
  416               nextByte = is.read();
  417           }
  418       }
  419       
  420       private boolean isWhitespaceOrEOF( int aByte )
  421       {
  422           return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A; 
  423       }
  424       
  425   
  426       private void increment( byte[] data )
  427       {
  428           increment( data, data.length-1 );
  429       }
  430   
  431       private void increment( byte[] data, int position )
  432       {
  433           if( position > 0 && (data[position]+256)%256 == 255 )
  434           {
  435               data[position]=0;
  436               increment( data, position-1);
  437           }
  438           else
  439           {
  440               data[position] = (byte)(data[position]+1);
  441           }
  442       }
  443       
  444       private String createStringFromBytes( byte[] bytes ) throws IOException
  445       {
  446           String retval = null;
  447           if( bytes.length == 1 )
  448           {
  449               retval = new String( bytes );
  450           }
  451           else
  452           {
  453               retval = new String( bytes, "UTF-16BE" );
  454           }
  455           return retval;
  456       }
  457   
  458       private int compare( byte[] first, byte[] second )
  459       {
  460           int retval = 1;
  461           boolean done = false;
  462           for( int i=0; i<first.length && !done; i++ )
  463           {
  464               if( first[i] == second[i] )
  465               {
  466                   //move to next position
  467               }
  468               else if( ((first[i]+256)%256) < ((second[i]+256)%256) )
  469               {
  470                   done = true;
  471                   retval = -1;
  472               }
  473               else
  474               {
  475                   done = true;
  476                   retval = 1;
  477               }
  478           }
  479           return retval;
  480       }
  481       
  482       /**
  483        * Internal class.
  484        */
  485       private class LiteralName
  486       {
  487           private String name;
  488           private LiteralName( String theName )
  489           {
  490               name = theName;
  491           }
  492       }
  493       
  494       /**
  495        * Internal class.
  496        */
  497       private class Operator
  498       {
  499           private String op;
  500           private Operator( String theOp )
  501           {
  502               op = theOp;
  503           }
  504       }
  505       
  506       /**
  507        * A simple class to test parsing of cmap files.
  508        * 
  509        * @param args Some command line arguments.
  510        * 
  511        * @throws Exception If there is an error parsing the file.
  512        */
  513       public static void main( String[] args ) throws Exception
  514       {
  515           if( args.length != 1 )
  516           {
  517               System.err.println( "usage: java org.pdfbox.cmapparser.CMapParser <CMAP File>" );
  518               System.exit( -1 );
  519           }
  520           CMapParser parser = new CMapParser(  );
  521           File cmapFile = new File( args[0] );
  522           CMap result = parser.parse( cmapFile );
  523           System.out.println( "Result:" + result );
  524       }
  525   }

Home » pdfbox-1.1.0-src » org.apache.fontbox.cmap » [javadoc | source]