/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.searchengine.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Calendar;

import java.net.URL;
import java.net.URLConnection;

import java.util.Date;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;

import org.apache.pdfbox.util.PDFTextStripper;

/**
 * This class is used to create a document for the lucene search engine.
 * This should easily plug into the IndexHTML or IndexFiles that comes with
 * the lucene project.  This class will populate the following fields.
 * <table>
 *      <tr>
 *          <th>Lucene Field Name</th>
 *          <th>Description</th>
 *      </tr>
 *      <tr>
 *          <td>path</td>
 *          <td>File system path if loaded from a file</td>
 *      </tr>
 *      <tr>
 *          <td>url</td>
 *          <td>URL to PDF document</td>
 *      </tr>
 *      <tr>
 *          <td>contents</td>
 *          <td>Entire contents of PDF document, indexed but not stored</td>
 *      </tr>
 *      <tr>
 *          <td>summary</td>
 *          <td>First 500 characters of content</td>
 *      </tr>
 *      <tr>
 *          <td>modified</td>
 *          <td>The modified date/time according to the url or path</td>
 *      </tr>
 *      <tr>
 *          <td>uid</td>
 *          <td>A unique identifier for the Lucene document.</td>
 *      </tr>
 *      <tr>
 *          <td>Author</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>CreationDate</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Creator</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Keywords</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>ModificationDate</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Producer</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Subject</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Title</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 *      <tr>
 *          <td>Trapped</td>
 *          <td>From PDF meta-data if available</td>
 *      </tr>
 * </table>
 *
 * <p>An instance keeps one text stripper and reuses it for every conversion.
 * NOTE(review): that state is not synchronized, so sharing one instance between
 * threads is presumably unsupported - confirm before doing so.</p>
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.23 $
 */
public final class LucenePDFDocument
{
    // Platform separator; replaced when deriving the "url" field and the uid.
    private static final char FILE_SEPARATOR = File.separatorChar;

    // Maximum number of characters of extracted text kept in the "summary" field.
    private static final int SUMMARY_LENGTH = 500;

    // given caveat of increased search times when using
    //MICROSECOND, only use SECOND by default
    private DateTools.Resolution dateTimeResolution = DateTools.Resolution.SECOND;

    // Created lazily on first use unless one was supplied through setTextStripper.
    private PDFTextStripper stripper = null;

    /**
     * Constructor.
     */
    public LucenePDFDocument()
    {
    }

    /**
     * Set the text stripper that will be used during extraction.
     *
     * @param aStripper The new pdf text stripper.
     */
    public void setTextStripper( PDFTextStripper aStripper )
    {
        stripper = aStripper;
    }

    /**
     * Get the Lucene date/time resolution.
     *
     * @return current date/time resolution
     */
    public DateTools.Resolution getDateTimeResolution()
    {
        return dateTimeResolution;
    }

    /**
     * Set the Lucene date/time resolution.
     *
     * @param resolution set new date/time resolution
     */
    public void setDateTimeResolution( DateTools.Resolution resolution )
    {
        dateTimeResolution = resolution;
    }

    //
    // compatibility methods for lucene-1.9+
    //

    /**
     * Format a time stamp at the configured date/time resolution.
     *
     * @param time The time in milliseconds.
     * @return The time formatted by Lucene's DateTools.
     */
    private String timeToString( long time )
    {
        return DateTools.timeToString( time, dateTimeResolution );
    }

    /**
     * Add a stored field that is indexed as a single un-analyzed term.
     * Does nothing when the value is null.
     */
    private void addKeywordField( Document document, String name, String value )
    {
        if ( value != null )
        {
            document.add( new Field( name, value, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
        }
    }

    /**
     * Add a Reader-valued field. Does nothing when the reader is null.
     */
    private void addTextField( Document document, String name, Reader value )
    {
        if ( value != null )
        {
            document.add( new Field( name, value ) );
        }
    }

    /**
     * Add a stored, analyzed field. Does nothing when the value is null.
     */
    private void addTextField( Document document, String name, String value )
    {
        if ( value != null )
        {
            document.add( new Field( name, value, Field.Store.YES, Field.Index.ANALYZED ) );
        }
    }

    /**
     * Add a date as a stored, analyzed field, formatted at the configured
     * date/time resolution. Does nothing when the value is null.
     */
    private void addTextField( Document document, String name, Date value )
    {
        if ( value != null )
        {
            addTextField( document, name, DateTools.dateToString( value, dateTimeResolution ) );
        }
    }

    /**
     * Add a calendar value by delegating to the Date variant.
     * Does nothing when the value is null.
     */
    private void addTextField( Document document, String name, Calendar value )
    {
        if ( value != null )
        {
            addTextField( document, name, value.getTime() );
        }
    }

    /**
     * Add a field that is stored but not indexed. Does nothing when the value is null.
     */
    private static void addUnindexedField( Document document, String name, String value )
    {
        if ( value != null )
        {
            document.add( new Field( name, value, Field.Store.YES, Field.Index.NO ) );
        }
    }

    /**
     * Add a field that is indexed as a single un-analyzed term but not stored.
     * Does nothing when the value is null.
     */
    private void addUnstoredKeywordField( Document document, String name, String value )
    {
        if ( value != null )
        {
            document.add( new Field( name, value, Field.Store.NO, Field.Index.NOT_ANALYZED ) );
        }
    }

    /**
     * Add the "modified" and "uid" fields shared by the file and URL conversions.
     *
     * The time stamp is formatted once and used for both fields so that they can
     * never disagree with each other.
     *
     * @param document The document to add the fields to.
     * @param location The path or external URL form the uid is derived from.
     * @param lastModified The last modification time in milliseconds.
     */
    private void addModifiedAndUid( Document document, String location, long lastModified )
    {
        String modified = timeToString( lastModified );

        // Add the last modified date as a field named "modified".  Use a
        // Keyword field, so that it's searchable, but so that no attempt is made
        // to tokenize the field into words.
        addKeywordField( document, "modified", modified );

        // uid = location with separators turned into NUL characters, a NUL
        // character, then the modification time stamp.
        String uid = location.replace( FILE_SEPARATOR, '\u0000' )
            + "\u0000"
            + modified;

        // Add the uid as a field, so that index can be incrementally maintained.
        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        addUnstoredKeywordField( document, "uid", uid );
    }

    /**
     * Convert the PDF stream to a lucene document.
     *
     * The stream is not closed by this method.
     *
     * @param is The input stream.
     * @return The input stream converted to a lucene document.
     * @throws IOException If there is an error converting the PDF.
     */
    public Document convertDocument( InputStream is ) throws IOException
    {
        Document document = new Document();
        addContent( document, is, "<inputstream>" );
        return document;
    }

    /**
     * This will take a reference to a PDF document and create a lucene document.
     *
     * @param file A reference to a PDF document.
     * @return The converted lucene document.
     *
     * @throws IOException If there is an exception while converting the document.
     */
    public Document convertDocument( File file ) throws IOException
    {
        Document document = new Document();
        String path = file.getPath();

        // Add the path and url as fields.  Use UnIndexed fields, so that they
        // are just stored with the document, but are not searchable.
        addUnindexedField( document, "path", path );
        addUnindexedField( document, "url", path.replace( FILE_SEPARATOR, '/' ) );

        addModifiedAndUid( document, path, file.lastModified() );

        FileInputStream input = null;
        try
        {
            input = new FileInputStream( file );
            addContent( document, input, path );
        }
        finally
        {
            if( input != null )
            {
                input.close();
            }
        }

        return document;
    }

    /**
     * Convert the document from a PDF to a lucene document.
     *
     * @param url A url to a PDF document.
     * @return The PDF converted to a lucene document.
     * @throws IOException If there is an error while converting the document.
     */
    public Document convertDocument( URL url ) throws IOException
    {
        Document document = new Document();
        URLConnection connection = url.openConnection();
        connection.connect();
        String externalForm = url.toExternalForm();

        // Add the url as a field named "url".  Use an UnIndexed field, so
        // that the url is just stored with the document, but is not searchable.
        addUnindexedField( document, "url", externalForm );

        // NOTE(review): the uid is built by replacing the platform file separator
        // inside a URL, so its value differs between platforms.  Kept as-is because
        // changing it would alter uid values - confirm before touching.
        addModifiedAndUid( document, externalForm, connection.getLastModified() );

        InputStream input = null;
        try
        {
            input = connection.getInputStream();
            addContent( document, input, externalForm );
        }
        finally
        {
            if( input != null )
            {
                input.close();
            }
        }

        return document;
    }

    /**
     * This will get a lucene document from a PDF file.
     *
     * @param is The stream to read the PDF from.
     *
     * @return The lucene document.
     *
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument( InputStream is ) throws IOException
    {
        LucenePDFDocument converter = new LucenePDFDocument();
        return converter.convertDocument( is );
    }

    /**
     * This will get a lucene document from a PDF file.
     *
     * @param file The file to get the document for.
     *
     * @return The lucene document.
     *
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument( File file ) throws IOException
    {
        LucenePDFDocument converter = new LucenePDFDocument();
        return converter.convertDocument( file );
    }

    /**
     * This will get a lucene document from a PDF file.
     *
     * @param url The file to get the document for.
     *
     * @return The lucene document.
     *
     * @throws IOException If there is an error parsing or indexing the document.
     */
    public static Document getDocument( URL url ) throws IOException
    {
        LucenePDFDocument converter = new LucenePDFDocument();
        return converter.convertDocument( url );
    }

    /**
     * This will add the contents to the lucene document.
     *
     * Adds, in order: the "contents" field, the PDF meta-data fields and the
     * "summary" field.  The PDF document is always closed; the input stream is not.
     *
     * @param document The document to add the contents to.
     * @param is The stream to get the contents from.
     * @param documentLocation The location of the document, used just for debug messages.
     *
     * @throws IOException If there is an error parsing the document, or if it is
     *         encrypted and cannot be decrypted with the empty password.  In the
     *         latter case the original exception is attached as the cause.
     */
    private void addContent( Document document, InputStream is, String documentLocation ) throws IOException
    {
        PDDocument pdfDocument = null;
        try
        {
            pdfDocument = PDDocument.load( is );

            if( pdfDocument.isEncrypted() )
            {
                //Just try using the default password and move on
                pdfDocument.decrypt( "" );
            }

            String contents = extractText( pdfDocument );

            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            addTextField( document, "contents", new StringReader( contents ) );

            addMetadata( document, pdfDocument.getDocumentInformation() );

            // Add the summary as an UnIndexed field, so that it is stored and returned
            // with hit documents for display.
            int summarySize = Math.min( contents.length(), SUMMARY_LENGTH );
            addUnindexedField( document, "summary", contents.substring( 0, summarySize ) );
        }
        catch( CryptographyException e )
        {
            throw ioExceptionWithCause(
                "Error decrypting document(" + documentLocation + "): " + e, e );
        }
        catch( InvalidPasswordException e )
        {
            //they didn't supply a password and the default of "" was wrong.
            throw ioExceptionWithCause(
                "Error: The document(" + documentLocation +
                ") is encrypted and will not be indexed.", e );
        }
        finally
        {
            if( pdfDocument != null )
            {
                pdfDocument.close();
            }
        }
    }

    /**
     * Extract the text of a PDF document with the configured stripper.
     *
     * A stripper is created on first use; an existing one is reset before
     * being reused.
     *
     * @param pdfDocument The loaded (and, if needed, decrypted) PDF document.
     * @return The extracted text.
     * @throws IOException If the stripper fails to write the text.
     */
    private String extractText( PDDocument pdfDocument ) throws IOException
    {
        StringWriter writer = new StringWriter();
        if( stripper == null )
        {
            stripper = new PDFTextStripper();
        }
        else
        {
            stripper.resetEngine();
        }
        stripper.writeText( pdfDocument, writer );
        return writer.toString();
    }

    /**
     * Add the PDF document information entries as text fields.
     *
     * Entries whose value is null are skipped by the field helpers.
     *
     * @param document The document to add the fields to.
     * @param info The PDF document information, may be null.
     */
    private void addMetadata( Document document, PDDocumentInformation info )
    {
        if( info == null )
        {
            return;
        }
        addTextField( document, "Author", info.getAuthor() );
        try
        {
            addTextField( document, "CreationDate", info.getCreationDate() );
        }
        catch( IOException ignored )
        {
            //ignore, bad date but continue with indexing
        }
        addTextField( document, "Creator", info.getCreator() );
        addTextField( document, "Keywords", info.getKeywords() );
        try
        {
            addTextField( document, "ModificationDate", info.getModificationDate() );
        }
        catch( IOException ignored )
        {
            //ignore, bad date but continue with indexing
        }
        addTextField( document, "Producer", info.getProducer() );
        addTextField( document, "Subject", info.getSubject() );
        addTextField( document, "Title", info.getTitle() );
        addTextField( document, "Trapped", info.getTrapped() );
    }

    /**
     * Create an IOException that carries the given cause.
     *
     * initCause is used instead of the two-argument constructor so that the
     * class does not depend on a newer Java API than the rest of the file.
     *
     * @param message The exception message.
     * @param cause The exception that triggered the failure.
     * @return The new exception, ready to be thrown.
     */
    private static IOException ioExceptionWithCause( String message, Throwable cause )
    {
        IOException wrapped = new IOException( message );
        wrapped.initCause( cause );
        return wrapped;
    }

    /**
     * This will test creating a document.
     *
     * usage: java org.apache.pdfbox.searchengine.lucene.LucenePDFDocument &lt;pdf-document&gt;
     *
     * @param args command line arguments.
     *
     * @throws IOException If there is an error.
     */
    public static void main( String[] args ) throws IOException
    {
        if( args.length != 1 )
        {
            String us = LucenePDFDocument.class.getName();
            System.err.println( "usage: java " + us + " <pdf-document>" );
            System.exit( 1 );
        }
        System.out.println( "Document=" + getDocument( new File( args[0] ) ) );
    }
}