Home » pdfbox-1.1.0-src » org.apache.pdfbox.searchengine.lucene » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.pdfbox.searchengine.lucene;
   18   
   19   import java.io.File;
   20   import java.io.FileInputStream;
   21   import java.io.InputStream;
   22   import java.io.IOException;
   23   import java.io.Reader;
   24   import java.io.StringReader;
   25   import java.io.StringWriter;
   26   import java.util.Calendar;
   27   
   28   import java.net.URL;
   29   import java.net.URLConnection;
   30   
   31   import java.util.Date;
   32   
   33   import org.apache.lucene.document.DateTools;
   34   import org.apache.lucene.document.Document;
   35   import org.apache.lucene.document.Field;
   36   
   37   import org.apache.pdfbox.pdmodel.PDDocument;
   38   import org.apache.pdfbox.pdmodel.PDDocumentInformation;
   39   
   40   import org.apache.pdfbox.exceptions.CryptographyException;
   41   import org.apache.pdfbox.exceptions.InvalidPasswordException;
   42   
   43   import org.apache.pdfbox.util.PDFTextStripper;
   44   
   45   /**
   46    * This class is used to create a document for the lucene search engine.
   47    * This should easily plug into the IndexHTML or IndexFiles that comes with
   48    * the lucene project.  This class will populate the following fields.
   49    * <table>
   50    *      <tr>
   51    *          <th>Lucene Field Name</th>
   52    *          <th>Description</th>
   53    *      </tr>
   54    *      <tr>
   55    *          <td>path</td>
   56    *          <td>File system path if loaded from a file</td>
   57    *      </tr>
   58    *      <tr>
   59    *          <td>url</td>
   60    *          <td>URL to PDF document</td>
   61    *      </tr>
   62    *      <tr>
   63    *          <td>contents</td>
   64    *          <td>Entire contents of PDF document, indexed but not stored</td>
   65    *      </tr>
   66    *      <tr>
   67    *          <td>summary</td>
   68    *          <td>First 500 characters of content</td>
   69    *      </tr>
   70    *      <tr>
   71    *          <td>modified</td>
   72    *          <td>The modified date/time according to the url or path</td>
   73    *      </tr>
   74    *      <tr>
   75    *          <td>uid</td>
   76    *          <td>A unique identifier for the Lucene document.</td>
   77    *      </tr>
   78    *      <tr>
   79    *          <td>CreationDate</td>
   80    *          <td>From PDF meta-data if available</td>
   81    *      </tr>
   82    *      <tr>
   83    *          <td>Creator</td>
   84    *          <td>From PDF meta-data if available</td>
   85    *      </tr>
   86    *      <tr>
   87    *          <td>Keywords</td>
   88    *          <td>From PDF meta-data if available</td>
   89    *      </tr>
   90    *      <tr>
   91    *          <td>ModificationDate</td>
   92    *          <td>From PDF meta-data if available</td>
   93    *      </tr>
   94    *      <tr>
   95    *          <td>Producer</td>
   96    *          <td>From PDF meta-data if available</td>
   97    *      </tr>
   98    *      <tr>
   99    *          <td>Subject</td>
  100    *          <td>From PDF meta-data if available</td>
  101    *      </tr>
  102    *      <tr>
  103    *          <td>Trapped</td>
  104    *          <td>From PDF meta-data if available</td>
  105    *      </tr>
  106    * </table>
  107    *
  108    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  109    * @version $Revision: 1.23 $
  110    */
  111   public final class LucenePDFDocument
  112   {
  113       private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
  114   
  115       // given caveat of increased search times when using
  116       //MICROSECOND, only use SECOND by default
  117       private DateTools.Resolution dateTimeResolution = DateTools.Resolution.SECOND;
  118   
  119       private PDFTextStripper stripper = null;
  120   
  121       /**
  122        * Constructor.
  123        */
  124       public LucenePDFDocument()
  125       {
  126       }
  127   
  128       /**
  129        * Set the text stripper that will be used during extraction.
  130        *
  131        * @param aStripper The new pdf text stripper.
  132        */
  133       public void setTextStripper( PDFTextStripper aStripper )
  134       {
  135           stripper = aStripper;
  136       }
  137   
  138       /**
  139        * Get the Lucene data time resolution.
  140        *
  141        * @return current date/time resolution
  142        */
  143       public DateTools.Resolution getDateTimeResolution()
  144       {
  145           return dateTimeResolution;
  146       }
  147   
  148       /**
  149        * Set the Lucene data time resolution.
  150        *
  151        * @param resolution set new date/time resolution
  152        */
  153       public void setDateTimeResolution( DateTools.Resolution resolution )
  154       {
  155           dateTimeResolution = resolution;
  156       }
  157   
  158       //
  159       // compatibility methods for lucene-1.9+
  160       //
  161       private String timeToString( long time )
  162       {
  163           return DateTools.timeToString( time, dateTimeResolution );
  164       }
  165   
  166       private void addKeywordField( Document document, String name, String value )
  167       {
  168           if ( value != null )
  169           {
  170               document.add( new Field( name, value, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
  171           }
  172       }
  173   
  174       private void addTextField( Document document, String name, Reader value )
  175       {
  176           if ( value != null )
  177           {
  178               document.add( new Field( name, value ) );
  179           }
  180       }
  181   
  182       private void addTextField( Document document, String name, String value )
  183       {
  184           if ( value != null )
  185           {
  186               document.add( new Field( name, value, Field.Store.YES, Field.Index.ANALYZED ) );
  187           }
  188       }
  189   
  190       private void addTextField( Document document, String name, Date value )
  191       {
  192           if ( value != null )
  193           {
  194               addTextField( document, name, DateTools.dateToString( value, dateTimeResolution ) );
  195           }
  196       }
  197   
  198       private void addTextField( Document document, String name, Calendar value )
  199       {
  200           if ( value != null )
  201           {
  202               addTextField( document, name, value.getTime() );
  203           }
  204       }
  205   
  206       private static void addUnindexedField( Document document, String name, String value )
  207       {
  208           if ( value != null )
  209           {
  210               document.add( new Field( name, value, Field.Store.YES, Field.Index.NO ) );
  211           }
  212       }
  213   
  214       private void addUnstoredKeywordField( Document document, String name, String value )
  215       {
  216           if ( value != null )
  217           {
  218               document.add( new Field( name, value, Field.Store.NO, Field.Index.NOT_ANALYZED ) );
  219           }
  220       }
  221   
  222       /**
  223        * Convert the PDF stream to a lucene document.
  224        *
  225        * @param is The input stream.
  226        * @return The input stream converted to a lucene document.
  227        * @throws IOException If there is an error converting the PDF.
  228        */
  229       public Document convertDocument( InputStream is ) throws IOException
  230       {
  231           Document document = new Document();
  232           addContent( document, is, "<inputstream>" );
  233           return document;
  234   
  235       }
  236   
  237       /**
  238        * This will take a reference to a PDF document and create a lucene document.
  239        *
  240        * @param file A reference to a PDF document.
  241        * @return The converted lucene document.
  242        *
  243        * @throws IOException If there is an exception while converting the document.
  244        */
  245       public Document convertDocument( File file ) throws IOException
  246       {
  247           Document document = new Document();
  248   
  249           // Add the url as a field named "url".  Use an UnIndexed field, so
  250           // that the url is just stored with the document, but is not searchable.
  251           addUnindexedField( document, "path", file.getPath() );
  252           addUnindexedField( document, "url", file.getPath().replace(FILE_SEPARATOR, '/') );
  253   
  254           // Add the last modified date of the file a field named "modified".  Use a
  255           // Keyword field, so that it's searchable, but so that no attempt is made
  256           // to tokenize the field into words.
  257           addKeywordField( document, "modified", timeToString( file.lastModified() ) );
  258   
  259           String uid = file.getPath().replace(FILE_SEPARATOR,'\u0000')
  260                        + "\u0000"
  261                        + timeToString( file.lastModified() );
  262   
  263           // Add the uid as a field, so that index can be incrementally maintained.
  264           // This field is not stored with document, it is indexed, but it is not
  265           // tokenized prior to indexing.
  266           addUnstoredKeywordField( document, "uid", uid );
  267   
  268           FileInputStream input = null;
  269           try
  270           {
  271               input = new FileInputStream( file );
  272               addContent( document, input, file.getPath() );
  273           }
  274           finally
  275           {
  276               if( input != null )
  277               {
  278                   input.close();
  279               }
  280           }
  281   
  282   
  283           // return the document
  284   
  285           return document;
  286       }
  287   
  288       /**
  289        * Convert the document from a PDF to a lucene document.
  290        *
  291        * @param url A url to a PDF document.
  292        * @return The PDF converted to a lucene document.
  293        * @throws IOException If there is an error while converting the document.
  294        */
  295       public Document convertDocument( URL url ) throws IOException
  296       {
  297           Document document = new Document();
  298           URLConnection connection = url.openConnection();
  299           connection.connect();
  300           // Add the url as a field named "url".  Use an UnIndexed field, so
  301           // that the url is just stored with the document, but is not searchable.
  302           addUnindexedField( document, "url", url.toExternalForm() );
  303   
  304           // Add the last modified date of the file a field named "modified".  Use a
  305           // Keyword field, so that it's searchable, but so that no attempt is made
  306           // to tokenize the field into words.
  307           addKeywordField( document, "modified", timeToString(connection.getLastModified() ) );
  308   
  309           String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000')
  310                        + "\u0000"
  311                        + timeToString( connection.getLastModified() );
  312   
  313           // Add the uid as a field, so that index can be incrementally maintained.
  314           // This field is not stored with document, it is indexed, but it is not
  315           // tokenized prior to indexing.
  316           addUnstoredKeywordField( document, "uid", uid );
  317   
  318           InputStream input = null;
  319           try
  320           {
  321               input = connection.getInputStream();
  322               addContent( document, input,url.toExternalForm() );
  323           }
  324           finally
  325           {
  326               if( input != null )
  327               {
  328                   input.close();
  329               }
  330           }
  331   
  332           // return the document
  333           return document;
  334       }
  335   
  336       /**
  337        * This will get a lucene document from a PDF file.
  338        *
  339        * @param is The stream to read the PDF from.
  340        *
  341        * @return The lucene document.
  342        *
  343        * @throws IOException If there is an error parsing or indexing the document.
  344        */
  345       public static Document getDocument( InputStream is ) throws IOException
  346       {
  347           LucenePDFDocument converter = new LucenePDFDocument();
  348           return converter.convertDocument( is );
  349       }
  350   
  351       /**
  352        * This will get a lucene document from a PDF file.
  353        *
  354        * @param file The file to get the document for.
  355        *
  356        * @return The lucene document.
  357        *
  358        * @throws IOException If there is an error parsing or indexing the document.
  359        */
  360       public static Document getDocument( File file ) throws IOException
  361       {
  362           LucenePDFDocument converter = new LucenePDFDocument();
  363           return converter.convertDocument( file );
  364       }
  365   
  366       /**
  367        * This will get a lucene document from a PDF file.
  368        *
  369        * @param url The file to get the document for.
  370        *
  371        * @return The lucene document.
  372        *
  373        * @throws IOException If there is an error parsing or indexing the document.
  374        */
  375       public static Document getDocument( URL url ) throws IOException
  376       {
  377           LucenePDFDocument converter = new LucenePDFDocument();
  378           return converter.convertDocument( url );
  379       }
  380   
  381       /**
  382        * This will add the contents to the lucene document.
  383        *
  384        * @param document The document to add the contents to.
  385        * @param is The stream to get the contents from.
  386        * @param documentLocation The location of the document, used just for debug messages.
  387        *
  388        * @throws IOException If there is an error parsing the document.
  389        */
  390       private void addContent( Document document, InputStream is, String documentLocation ) throws IOException
  391       {
  392           PDDocument pdfDocument = null;
  393           try
  394           {
  395               pdfDocument = PDDocument.load( is );
  396   
  397               if( pdfDocument.isEncrypted() )
  398               {
  399                   //Just try using the default password and move on
  400                   pdfDocument.decrypt( "" );
  401               }
  402   
  403               //create a writer where to append the text content.
  404               StringWriter writer = new StringWriter();
  405               if( stripper == null )
  406               {
  407                   stripper = new PDFTextStripper();
  408               }
  409               else
  410               {
  411                   stripper.resetEngine();
  412               }
  413               stripper.writeText( pdfDocument, writer );
  414   
  415               // Note: the buffer to string operation is costless;
  416               // the char array value of the writer buffer and the content string
  417               // is shared as long as the buffer content is not modified, which will
  418               // not occur here.
  419               String contents = writer.getBuffer().toString();
  420   
  421               StringReader reader = new StringReader( contents );
  422   
  423               // Add the tag-stripped contents as a Reader-valued Text field so it will
  424               // get tokenized and indexed.
  425               addTextField( document, "contents", reader );
  426   
  427               PDDocumentInformation info = pdfDocument.getDocumentInformation();
  428               if( info != null )
  429               {
  430                   addTextField( document, "Author", info.getAuthor() );
  431                   try
  432                   {
  433                       addTextField( document, "CreationDate", info.getCreationDate() );
  434                   }
  435                   catch( IOException io )
  436                   {
  437                       //ignore, bad date but continue with indexing
  438                   }
  439                   addTextField( document, "Creator", info.getCreator() );
  440                   addTextField( document, "Keywords", info.getKeywords() );
  441                   try
  442                   {
  443                       addTextField( document, "ModificationDate", info.getModificationDate() );
  444                   }
  445                   catch( IOException io )
  446                   {
  447                       //ignore, bad date but continue with indexing
  448                   }
  449                   addTextField( document, "Producer", info.getProducer() );
  450                   addTextField( document, "Subject", info.getSubject() );
  451                   addTextField( document, "Title", info.getTitle() );
  452                   addTextField( document, "Trapped", info.getTrapped() );
  453               }
  454               int summarySize = Math.min( contents.length(), 500 );
  455               String summary = contents.substring( 0, summarySize );
  456               // Add the summary as an UnIndexed field, so that it is stored and returned
  457               // with hit documents for display.
  458               addUnindexedField( document, "summary", summary );
  459           }
  460           catch( CryptographyException e )
  461           {
  462               throw new IOException( "Error decrypting document(" + documentLocation + "): " + e );
  463           }
  464           catch( InvalidPasswordException e )
  465           {
  466               //they didn't suppply a password and the default of "" was wrong.
  467               throw new IOException(
  468                   "Error: The document(" + documentLocation +
  469                   ") is encrypted and will not be indexed." );
  470           }
  471           finally
  472           {
  473               if( pdfDocument != null )
  474               {
  475                   pdfDocument.close();
  476               }
  477           }
  478       }
  479   
  480       /**
  481        * This will test creating a document.
  482        *
  483        * usage: java pdfparser.searchengine.lucene.LucenePDFDocument &lt;pdf-document&gt;
  484        *
  485        * @param args command line arguments.
  486        *
  487        * @throws IOException If there is an error.
  488        */
  489       public static void main( String[] args ) throws IOException
  490       {
  491           if( args.length != 1 )
  492           {
  493               String us = LucenePDFDocument.class.getName();
  494               System.err.println( "usage: java " + us + " <pdf-document>" );
  495               System.exit( 1 );
  496           }
  497           System.out.println( "Document=" + getDocument( new File( args[0] ) ) );
  498       }
  499   }

Home » pdfbox-1.1.0-src » org.apache.pdfbox.searchengine.lucene » [javadoc | source]