1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.pdfbox.encoding; 18 19 import java.io.BufferedReader; 20 import java.io.IOException; 21 import java.io.InputStream; 22 import java.io.InputStreamReader; 23 import java.util.Collections; 24 import java.util.HashMap; 25 import java.util.Map; 26 import java.util.StringTokenizer; 27 import java.util.logging.Level; 28 import java.util.logging.Logger; 29 import java.io.File; 30 31 import org.apache.pdfbox.cos.COSName; 32 import org.apache.pdfbox.pdmodel.common.COSObjectable; 33 import org.apache.pdfbox.util.ResourceLoader; 34 35 /** 36 * This is an interface to a text encoder. 37 * 38 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> 39 * @version $Revision: 1.15 $ 40 */ 41 public abstract class Encoding implements COSObjectable 42 { 43 /** 44 * This is a mapping from a character code to a character name. 45 */ 46 protected final Map<Integer, COSName> codeToName = 47 new HashMap<Integer, COSName>(); 48 49 /** 50 * This is a mapping from a character name to a character code. 51 */ 52 protected final Map<COSName, Integer> nameToCode = 53 new HashMap<COSName, Integer>(); 54 55 private static final Map<COSName, String> NAME_TO_CHARACTER = 56 new HashMap<COSName, String>(); 57 58 private static final Map<String, COSName> CHARACTER_TO_NAME = 59 new HashMap<String, COSName>(); 60 61 static 62 { 63 //Loads the official Adobe Glyph List 64 loadGlyphList("Resources/glyphlist.txt"); 65 66 // Load an external glyph list file that user can give as JVM property 67 String location = System.getProperty("glyphlist_ext"); 68 if(location != null) 69 { 70 File external = new File(location); 71 if(external.exists()) 72 { 73 loadGlyphList(location); 74 } 75 } 76 77 NAME_TO_CHARACTER.put( COSName.getPDFName( ".notdef" ), "" ); 78 NAME_TO_CHARACTER.put( COSName.getPDFName( "fi" ), "fi" ); 79 NAME_TO_CHARACTER.put( COSName.getPDFName( "fl" ), "fl" ); 80 NAME_TO_CHARACTER.put( COSName.getPDFName( "ffi" ), "ffi" ); 81 NAME_TO_CHARACTER.put( COSName.getPDFName( "ff" ), "ff" ); 82 NAME_TO_CHARACTER.put( COSName.getPDFName( "pi" ), "pi" ); 83 84 // add some (alternative) glyph mappings. These are missing in 85 // the original copy of the adobe glyphlist.txt 86 // also mapped as anglebracketleft 87 NAME_TO_CHARACTER.put(COSName.getPDFName("angbracketleft"), "\u3008"); 88 // also mapped as anglebracketright 89 NAME_TO_CHARACTER.put(COSName.getPDFName("angbracketright"), "\u3009"); 90 // also mapped as copyright 91 NAME_TO_CHARACTER.put(COSName.getPDFName("circlecopyrt"), "\u00A9"); 92 NAME_TO_CHARACTER.put(COSName.getPDFName("controlNULL"), "\u0000"); 93 94 for( Map.Entry<COSName, String> entry : NAME_TO_CHARACTER.entrySet() ) 95 { 96 CHARACTER_TO_NAME.put( entry.getValue(), entry.getKey() ); 97 } 98 } 99 100 /** 101 * Loads a glyph list from a given location and populates the NAME_TO_CHARACTER hashmap 102 * for character lookups. 103 * @param location - The string location of the glyphlist file 104 */ 105 private static void loadGlyphList(String location) 106 { 107 BufferedReader glyphStream = null; 108 try 109 { 110 InputStream resource = ResourceLoader.loadResource( location ); 111 glyphStream = new BufferedReader( new InputStreamReader( resource ) ); 112 String line = null; 113 while( (line = glyphStream.readLine()) != null ) 114 { 115 line = line.trim(); 116 //lines starting with # are comments which we can ignore. 117 if( !line.startsWith("#" ) ) 118 { 119 int semicolonIndex = line.indexOf( ';' ); 120 if( semicolonIndex >= 0 ) 121 { 122 try 123 { 124 String characterName = line.substring( 0, semicolonIndex ); 125 String unicodeValue = line.substring( semicolonIndex+1, line.length() ); 126 StringTokenizer tokenizer = new StringTokenizer( unicodeValue, " ", false ); 127 String value = ""; 128 while(tokenizer.hasMoreTokens()) 129 { 130 int characterCode = Integer.parseInt( tokenizer.nextToken(), 16 ); 131 value += (char)characterCode; 132 } 133 NAME_TO_CHARACTER.put( COSName.getPDFName( characterName ), value ); 134 } 135 catch( NumberFormatException nfe ) 136 { 137 nfe.printStackTrace(); 138 } 139 } 140 } 141 } 142 } 143 catch( IOException io ) 144 { 145 io.printStackTrace(); 146 } 147 finally 148 { 149 if( glyphStream != null ) 150 { 151 try 152 { 153 glyphStream.close(); 154 } 155 catch( IOException e ) 156 { 157 e.printStackTrace(); 158 } 159 160 } 161 } 162 } 163 164 /** 165 * Returns an unmodifiable view of the Code2Name mapping. 166 * @return the Code2Name map 167 */ 168 public Map<Integer, COSName> getCodeToNameMap() 169 { 170 return Collections.unmodifiableMap(codeToName); 171 } 172 173 /** 174 * Returns an unmodifiable view of the Name2Code mapping. 175 * @return the Name2Code map 176 */ 177 public Map<COSName, Integer> getNameToCodeMap() 178 { 179 return Collections.unmodifiableMap(nameToCode); 180 } 181 182 /** 183 * This will add a character encoding. 184 * 185 * @param code The character code that matches the character. 186 * @param name The name of the character. 187 */ 188 protected void addCharacterEncoding( int code, COSName name ) 189 { 190 codeToName.put( code, name ); 191 nameToCode.put( name, code ); 192 } 193 194 /** 195 * This will get the character code for the name. 196 * 197 * @param name The name of the character. 198 * 199 * @return The code for the character. 200 * 201 * @throws IOException If there is no character code for the name. 202 */ 203 public int getCode( COSName name ) throws IOException 204 { 205 Integer code = nameToCode.get( name ); 206 if( code == null ) 207 { 208 throw new IOException( "No character code for character name '" + name.getName() + "'" ); 209 } 210 return code; 211 } 212 213 /** 214 * This will take a character code and get the name from the code. 215 * 216 * @param code The character code. 217 * 218 * @return The name of the character. 219 * 220 * @throws IOException If there is no name for the code. 221 */ 222 public COSName getName( int code ) throws IOException 223 { 224 COSName name = codeToName.get( code ); 225 if( name == null ) 226 { 227 //lets be forgiving for now 228 name = COSName.getPDFName( "space" ); 229 //throw new IOException( getClass().getName() + 230 // ": No name for character code '" + code + "'" ); 231 } 232 return name; 233 } 234 235 /** 236 * This will take a character code and get the name from the code. 237 * 238 * @param c The character. 239 * 240 * @return The name of the character. 241 * 242 * @throws IOException If there is no name for the character. 243 */ 244 public COSName getNameFromCharacter( char c ) throws IOException 245 { 246 COSName name = CHARACTER_TO_NAME.get( Character.toString(c) ); 247 if( name == null ) 248 { 249 throw new IOException( "No name for character '" + c + "'" ); 250 } 251 return name; 252 } 253 254 /** 255 * This will get the character from the code. 256 * 257 * @param code The character code. 258 * 259 * @return The printable character for the code. 260 * 261 * @throws IOException If there is not name for the character. 262 */ 263 public String getCharacter( int code ) throws IOException 264 { 265 String character = getCharacter( getName( code ) ); 266 return character; 267 } 268 269 /** 270 * This will get the character from the name. 271 * 272 * @param name The name of the character. 273 * 274 * @return The printable character for the code. 275 */ 276 public static String getCharacter( COSName name ) 277 { 278 COSName baseName = name; 279 280 String character = NAME_TO_CHARACTER.get( baseName ); 281 if( character == null ) 282 { 283 String nameStr = baseName.getName(); 284 // test for Unicode name 285 // (uniXXXX - XXXX must be a multiple of four; 286 // each representing a hexadecimal Unicode code point) 287 if ( nameStr.startsWith( "uni" ) ) 288 { 289 StringBuilder uniStr = new StringBuilder(); 290 291 for ( int chPos = 3; chPos + 4 <= nameStr.length(); chPos += 4 ) 292 { 293 try 294 { 295 int characterCode = Integer.parseInt( nameStr.substring( chPos, chPos + 4), 16 ); 296 297 if ( ( characterCode > 0xD7FF ) && ( characterCode < 0xE000 ) ) 298 { 299 Logger.getLogger(Encoding.class.getName()).log( Level.WARNING, 300 "Unicode character name with not allowed code area: " + 301 nameStr ); 302 } 303 else 304 { 305 uniStr.append( (char) characterCode ); 306 } 307 } 308 catch (NumberFormatException nfe) 309 { 310 Logger.getLogger(Encoding.class.getName()).log( Level.WARNING, 311 "Not a number in Unicode character name: " + 312 nameStr ); 313 } 314 } 315 character = uniStr.toString(); 316 } 317 else 318 { 319 // test if we have a suffix and if so remove it 320 if ( nameStr.indexOf('.') > 0 ) 321 { 322 nameStr = nameStr.substring( 0, nameStr.indexOf('.') ); 323 baseName = COSName.getPDFName( nameStr ); 324 getCharacter(baseName); 325 } 326 327 character = nameStr; 328 } 329 } 330 return character; 331 } 332 333 }