Home » pdfbox-1.1.0-src » org.apache.pdfbox.encoding » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.pdfbox.encoding;
   18   
   19   import java.io.BufferedReader;
   20   import java.io.IOException;
   21   import java.io.InputStream;
   22   import java.io.InputStreamReader;
   23   import java.util.Collections;
   24   import java.util.HashMap;
   25   import java.util.Map;
   26   import java.util.StringTokenizer;
   27   import java.util.logging.Level;
   28   import java.util.logging.Logger;
   29   import java.io.File;
   30   
   31   import org.apache.pdfbox.cos.COSName;
   32   import org.apache.pdfbox.pdmodel.common.COSObjectable;
   33   import org.apache.pdfbox.util.ResourceLoader;
   34   
   35   /**
   36    * This is an interface to a text encoder.
   37    *
   38    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
   39    * @version $Revision: 1.15 $
   40    */
   41   public abstract class Encoding implements COSObjectable
   42   {
   43       /**
   44        * This is a mapping from a character code to a character name.
   45        */
   46       protected final Map<Integer, COSName> codeToName =
   47           new HashMap<Integer, COSName>();
   48   
   49       /**
   50        * This is a mapping from a character name to a character code.
   51        */
   52       protected final Map<COSName, Integer> nameToCode =
   53           new HashMap<COSName, Integer>();
   54   
   55       private static final Map<COSName, String> NAME_TO_CHARACTER =
   56           new HashMap<COSName, String>();
   57   
   58       private static final Map<String, COSName> CHARACTER_TO_NAME =
   59           new HashMap<String, COSName>();
   60   
   61       static
   62       {
   63           //Loads the official Adobe Glyph List
   64           loadGlyphList("Resources/glyphlist.txt");
   65   
   66           // Load an external glyph list file that user can give as JVM property
   67           String location = System.getProperty("glyphlist_ext");
   68           if(location != null)
   69           {
   70               File external = new File(location);
   71               if(external.exists())
   72               {
   73                   loadGlyphList(location);
   74               }
   75           }
   76   
   77           NAME_TO_CHARACTER.put( COSName.getPDFName( ".notdef" ), "" );
   78           NAME_TO_CHARACTER.put( COSName.getPDFName( "fi" ), "fi" );
   79           NAME_TO_CHARACTER.put( COSName.getPDFName( "fl" ), "fl" );
   80           NAME_TO_CHARACTER.put( COSName.getPDFName( "ffi" ), "ffi" );
   81           NAME_TO_CHARACTER.put( COSName.getPDFName( "ff" ), "ff" );
   82           NAME_TO_CHARACTER.put( COSName.getPDFName( "pi" ), "pi" );
   83   
   84           // add some (alternative) glyph mappings. These are missing in
   85           // the original copy of the adobe glyphlist.txt 
   86           // also mapped as anglebracketleft
   87           NAME_TO_CHARACTER.put(COSName.getPDFName("angbracketleft"), "\u3008");
   88           // also mapped as anglebracketright
   89           NAME_TO_CHARACTER.put(COSName.getPDFName("angbracketright"), "\u3009");
   90           // also mapped as copyright
   91           NAME_TO_CHARACTER.put(COSName.getPDFName("circlecopyrt"), "\u00A9");
   92           NAME_TO_CHARACTER.put(COSName.getPDFName("controlNULL"), "\u0000");
   93   
   94           for( Map.Entry<COSName, String> entry : NAME_TO_CHARACTER.entrySet() )
   95           {
   96               CHARACTER_TO_NAME.put( entry.getValue(), entry.getKey() );
   97           }
   98       }
   99   
  100       /**
  101        * Loads a glyph list from a given location and populates the NAME_TO_CHARACTER hashmap
  102        * for character lookups.
  103        * @param location - The string location of the glyphlist file 
  104        */
  105       private static void loadGlyphList(String location)
  106       {
  107           BufferedReader glyphStream = null;
  108           try
  109           {
  110               InputStream resource = ResourceLoader.loadResource( location );
  111               glyphStream = new BufferedReader( new InputStreamReader( resource ) );
  112               String line = null;
  113               while( (line = glyphStream.readLine()) != null )
  114               {
  115                   line = line.trim();
  116                   //lines starting with # are comments which we can ignore.
  117                   if( !line.startsWith("#" ) )
  118                   {
  119                       int semicolonIndex = line.indexOf( ';' );
  120                       if( semicolonIndex >= 0 )
  121                       {
  122                           try
  123                           {
  124                               String characterName = line.substring( 0, semicolonIndex );
  125                               String unicodeValue = line.substring( semicolonIndex+1, line.length() );
  126                               StringTokenizer tokenizer = new StringTokenizer( unicodeValue, " ", false );
  127                               String value = "";
  128                               while(tokenizer.hasMoreTokens())
  129                               {
  130                                   int characterCode = Integer.parseInt( tokenizer.nextToken(), 16 );
  131                                   value += (char)characterCode;
  132                               }
  133                               NAME_TO_CHARACTER.put( COSName.getPDFName( characterName ), value );
  134                           }
  135                           catch( NumberFormatException nfe )
  136                           {
  137                               nfe.printStackTrace();
  138                           }
  139                       }
  140                   }
  141               }
  142           }
  143           catch( IOException io )
  144           {
  145               io.printStackTrace();
  146           }
  147           finally
  148           {
  149               if( glyphStream != null )
  150               {
  151                   try
  152                   {
  153                       glyphStream.close();
  154                   }
  155                   catch( IOException e )
  156                   {
  157                       e.printStackTrace();
  158                   }
  159   
  160               }
  161           }
  162       }
  163   
  164       /**
  165        * Returns an unmodifiable view of the Code2Name mapping.
  166        * @return the Code2Name map 
  167        */
  168       public Map<Integer, COSName> getCodeToNameMap()
  169       {
  170           return Collections.unmodifiableMap(codeToName);
  171       }
  172         
  173       /**
  174        * Returns an unmodifiable view of the Name2Code mapping.
  175        * @return the Name2Code map 
  176        */
  177       public Map<COSName, Integer> getNameToCodeMap()
  178       {
  179           return Collections.unmodifiableMap(nameToCode);
  180       }
  181   
  182       /**
  183        * This will add a character encoding.
  184        *
  185        * @param code The character code that matches the character.
  186        * @param name The name of the character.
  187        */
  188       protected void addCharacterEncoding( int code, COSName name )
  189       {
  190           codeToName.put( code, name );
  191           nameToCode.put( name, code );
  192       }
  193   
  194       /**
  195        * This will get the character code for the name.
  196        *
  197        * @param name The name of the character.
  198        *
  199        * @return The code for the character.
  200        *
  201        * @throws IOException If there is no character code for the name.
  202        */
  203       public int getCode( COSName name ) throws IOException
  204       {
  205           Integer code = nameToCode.get( name );
  206           if( code == null )
  207           {
  208               throw new IOException( "No character code for character name '" + name.getName() + "'" );
  209           }
  210           return code;
  211       }
  212   
  213       /**
  214        * This will take a character code and get the name from the code.
  215        *
  216        * @param code The character code.
  217        *
  218        * @return The name of the character.
  219        *
  220        * @throws IOException If there is no name for the code.
  221        */
  222       public COSName getName( int code ) throws IOException
  223       {
  224           COSName name = codeToName.get( code );
  225           if( name == null )
  226           {
  227               //lets be forgiving for now
  228               name = COSName.getPDFName( "space" );
  229               //throw new IOException( getClass().getName() +
  230               //                       ": No name for character code '" + code + "'" );
  231           }
  232           return name;
  233       }
  234   
  235       /**
  236        * This will take a character code and get the name from the code.
  237        *
  238        * @param c The character.
  239        *
  240        * @return The name of the character.
  241        *
  242        * @throws IOException If there is no name for the character.
  243        */
  244       public COSName getNameFromCharacter( char c ) throws IOException
  245       {
  246           COSName name = CHARACTER_TO_NAME.get( Character.toString(c) );
  247           if( name == null )
  248           {
  249               throw new IOException( "No name for character '" + c + "'" );
  250           }
  251           return name;
  252       }
  253   
  254       /**
  255        * This will get the character from the code.
  256        *
  257        * @param code The character code.
  258        *
  259        * @return The printable character for the code.
  260        *
  261        * @throws IOException If there is not name for the character.
  262        */
  263       public String getCharacter( int code ) throws IOException
  264       {
  265           String character = getCharacter( getName( code ) );
  266           return character;
  267       }
  268   
  269       /**
  270        * This will get the character from the name.
  271        *
  272        * @param name The name of the character.
  273        *
  274        * @return The printable character for the code.
  275        */
  276       public static String getCharacter( COSName name )
  277       {
  278           COSName baseName = name;
  279    
  280           String character = NAME_TO_CHARACTER.get( baseName );
  281           if( character == null )
  282           {
  283               String nameStr = baseName.getName();
  284               // test for Unicode name
  285               // (uniXXXX - XXXX must be a multiple of four;
  286               // each representing a hexadecimal Unicode code point)
  287               if ( nameStr.startsWith( "uni" ) )
  288               {
  289                   StringBuilder uniStr = new StringBuilder();
  290   
  291                   for ( int chPos = 3; chPos + 4 <= nameStr.length(); chPos += 4 )
  292                   {
  293                       try 
  294                       {
  295                           int characterCode = Integer.parseInt( nameStr.substring( chPos, chPos + 4), 16 );
  296   
  297                           if ( ( characterCode > 0xD7FF ) && ( characterCode < 0xE000 ) )
  298                           {
  299                               Logger.getLogger(Encoding.class.getName()).log( Level.WARNING,
  300                                       "Unicode character name with not allowed code area: " +
  301                                       nameStr );
  302                           }
  303                           else
  304                           {
  305                               uniStr.append( (char) characterCode );
  306                           }
  307                       } 
  308                       catch (NumberFormatException nfe) 
  309                       {
  310                           Logger.getLogger(Encoding.class.getName()).log( Level.WARNING,
  311                                   "Not a number in Unicode character name: " +
  312                                   nameStr );
  313                       }
  314                   }
  315                   character = uniStr.toString();
  316               }
  317               else 
  318               {
  319                   // test if we have a suffix and if so remove it
  320                   if ( nameStr.indexOf('.') > 0 ) 
  321                   {
  322                       nameStr = nameStr.substring( 0, nameStr.indexOf('.') );
  323                       baseName = COSName.getPDFName( nameStr );
  324                       getCharacter(baseName);
  325                   }
  326   
  327                  character = nameStr;
  328               }
  329           }
  330           return character;
  331       } 
  332   
  333   }

Home » pdfbox-1.1.0-src » org.apache.pdfbox.encoding » [javadoc | source]