Home » pdfbox-1.1.0-src » org.apache.pdfbox.util » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.pdfbox.util;
   18   
   19   import java.io.IOException;
   20   import java.io.StringWriter;
   21   import java.io.Writer;
   22   import java.util.ArrayList;
   23   import java.util.Collections;
   24   import java.util.HashMap;
   25   import java.util.Iterator;
   26   import java.util.List;
   27   import java.util.Map;
   28   import java.util.Properties;
   29   import java.util.Vector;
   30   
   31   import org.apache.pdfbox.cos.COSDocument;
   32   import org.apache.pdfbox.cos.COSStream;
   33   import org.apache.pdfbox.exceptions.CryptographyException;
   34   import org.apache.pdfbox.exceptions.InvalidPasswordException;
   35   import org.apache.pdfbox.exceptions.WrappedIOException;
   36   import org.apache.pdfbox.pdmodel.PDDocument;
   37   import org.apache.pdfbox.pdmodel.PDPage;
   38   import org.apache.pdfbox.pdmodel.common.PDRectangle;
   39   import org.apache.pdfbox.pdmodel.common.PDStream;
   40   import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
   41   import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
   42   
   43   
   44   /**
   45    * This class will take a pdf document and strip out all of the text and ignore the
   46    * formatting and such.  Please note: it is up to clients of this class to verify that
   47    * a specific user has the correct permissions to extract text from the
   48    * PDF document.
   49    * 
   50    * The basic flow of this process is that we get a document and use a series of 
   51    * processXXX() functions that work on smaller and smaller chunks of the page.  
   52    * Eventually, we fully process each page and then print it. 
   53    *
   54    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
   55    * @version $Revision: 1.70 $
   56    */
   57   public class PDFTextStripper extends PDFStreamEngine
   58   {
    // One-based number of the page currently being processed; writeText resets it to 0
    // and processPages increments it before each page is handled.
    private int currentPageNo = 0;
    // First page (one-based, inclusive) whose text is extracted; checked in processPage.
    private int startPage = 1;
    // Last page (one-based, inclusive) whose text is extracted; checked in processPage.
    private int endPage = Integer.MAX_VALUE;
    // Optional bookmark marking where extraction starts (null means no start bookmark).
    private PDOutlineItem startBookmark = null;
    // One-based page number resolved from startBookmark; -1 means "no restriction".
    private int startBookmarkPageNumber = -1;
    // Optional bookmark marking where extraction ends (null means no end bookmark).
    private PDOutlineItem endBookmark = null;
    // One-based page number resolved from endBookmark; -1 means "no restriction".
    private int endBookmarkPageNumber = -1;
    // NOTE(review): not referenced in the visible part of this file; presumably controls
    // dropping of duplicated overlapping glyphs -- confirm against the rest of the class.
    private boolean suppressDuplicateOverlappingText = true;
    // When false, processPage collapses all text of a page into a single article section.
    private boolean shouldSeparateByBeads = true;
    // When true, writePage sorts each article's text with a TextPositionComparator and
    // uses the direction-adjusted coordinates of every TextPosition.
    private boolean sortByPosition = false;
    
    // We will need to estimate where to add spaces.  
    // These are used to help guess. 
    // Fraction of the (averaged) width of the space character used as the word-gap threshold.
    private float spacingTolerance = .5f;
    // Fraction of the average character width used as the alternative word-gap threshold.
    private float averageCharTolerance = .3f;

    // Thread beads of the page being processed, as returned by PDPage.getThreadBeads().
    private List pageArticles = null;
    /**
     * The charactersByArticle is used to extract text by article divisions.  For example
     * a PDF that has two columns like a newspaper, we want to extract the first column and
     * then the second column.  In this example the PDF would have 2 beads(or articles), one for
     * each column.  The size of the charactersByArticle would be 5, because not all text on the
     * screen will fall into one of the articles.  The five divisions are shown below
     *
     * Text before first article
     * first article text
     * text between first article and second article
     * second article text
     * text after second article
     *
     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
     */
    protected Vector charactersByArticle = new Vector();

    // Cleared at the start of every processed page; its use is outside the visible part
    // of this file (presumably it tracks already-seen characters -- TODO confirm).
    private Map characterListMapping = new HashMap();

    /**
     * The platform's line separator.
     */
    protected String lineSeparator = System.getProperty("line.separator");
    // Defaults to the platform line separator; NOTE(review): presumably written between
    // pages by writePageSeperator() -- confirm, that method is not visible here.
    private String pageSeparator = System.getProperty("line.separator");
    // Defaults to a single space; NOTE(review): presumably what getWordSeparator() returns.
    private String wordSeparator = " ";
    /**
     * encoding that text will be written in (or null).
     */
    protected String outputEncoding; 

    /**
     * The document to read.
     */
    protected PDDocument document;
    /**
     * The stream to write the output to.
     */
    protected Writer output;

    /**
     * The normalizer is used to remove text ligatures/presentation forms
     * and to correct the direction of right to left text, such as Arabic and Hebrew.
     */
    private TextNormalize normalize = null;
  120   
  121       /**
  122        * Instantiate a new PDFTextStripper object.  This object will load properties from
  123        * Resources/PDFTextStripper.properties and will not do anything special to 
  124        * convert the text to a more encoding-specific output.  
  125        * @throws IOException If there is an error loading the properties.
  126        */
  127       public PDFTextStripper() throws IOException
  128       {
  129           super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties", true ) );
  130           this.outputEncoding = null;
  131           normalize = new TextNormalize(this.outputEncoding);
  132       }
  133   
  134   
  135       /**
  136        * Instantiate a new PDFTextStripper object.  Loading all of the operator mappings
  137        * from the properties object that is passed in.  Does not convert the text
  138        * to more encoding-specific output.
  139        *
  140        * @param props The properties containing the mapping of operators to PDFOperator
  141        * classes.
  142        *
  143        * @throws IOException If there is an error reading the properties.
  144        */
  145       public PDFTextStripper( Properties props ) throws IOException
  146       {
  147           super( props );
  148           this.outputEncoding = null;
  149           normalize = new TextNormalize(this.outputEncoding);
  150       }
  151       /**
  152        * Instantiate a new PDFTextStripper object. This object will load properties from
  153        * Resources/PDFTextStripper.properties and will apply encoding-specific
  154        * conversions to the output text.  
  155        *
  156        * @param encoding The encoding that the output will be written in.
  157        *
  158        * @throws IOException If there is an error reading the properties.
  159        */
  160       public PDFTextStripper( String encoding ) throws IOException
  161       {
  162           super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties", true ));
  163           this.outputEncoding = encoding;
  164           normalize = new TextNormalize(this.outputEncoding);
  165       }
  166   
  167       /**
  168        * This will return the text of a document.  See writeText. <br />
  169        * NOTE: The document must not be encrypted when coming into this method.
  170        *
  171        * @param doc The document to get the text from.
  172        *
  173        * @return The text of the PDF document.
  174        *
  175        * @throws IOException if the doc state is invalid or it is encrypted.
  176        */
  177       public String getText( PDDocument doc ) throws IOException
  178       {
  179           StringWriter outputStream = new StringWriter();
  180           writeText( doc, outputStream );
  181           return outputStream.toString();
  182       }
  183   
  184       /**
  185        * @deprecated
  186        * @see PDFTextStripper#getText( PDDocument )
  187        * @param doc The document to extract the text from.
  188        * @return The document text.
  189        * @throws IOException If there is an error extracting the text.
  190        */
  191       public String getText( COSDocument doc ) throws IOException
  192       {
  193           return getText( new PDDocument( doc ) );
  194       }
  195   
  196       /**
  197        * @deprecated
  198        * @see PDFTextStripper#writeText( PDDocument, Writer )
  199        * @param doc The document to extract the text.
  200        * @param outputStream The stream to write the text to.
  201        * @throws IOException If there is an error extracting the text.
  202        */
  203       public void writeText( COSDocument doc, Writer outputStream ) throws IOException
  204       {
  205           writeText( new PDDocument( doc ), outputStream );
  206       }
  207   
  208       /**
  209        * This will take a PDDocument and write the text of that document to the print writer.
  210        *
  211        * @param doc The document to get the data from.
  212        * @param outputStream The location to put the text.
  213        *
  214        * @throws IOException If the doc is in an invalid state.
  215        */
  216       public void writeText( PDDocument doc, Writer outputStream ) throws IOException
  217       {
  218           resetEngine();
  219   
  220           currentPageNo = 0;
  221           document = doc;
  222           output = outputStream;
  223           startDocument(document);
  224   
  225           if( document.isEncrypted() )
  226           {
  227               // We are expecting non-encrypted documents here, but it is common
  228               // for users to pass in a document that is encrypted with an empty
  229               // password (such a document appears to not be encrypted by
  230               // someone viewing the document, thus the confusion).  We will
  231               // attempt to decrypt with the empty password to handle this case.
  232               //
  233               try
  234               {
  235                   document.decrypt("");
  236               }
  237               catch (CryptographyException e)
  238               {
  239                   throw new WrappedIOException("Error decrypting document, details: ", e);
  240               }
  241               catch (InvalidPasswordException e)
  242               {
  243                   throw new WrappedIOException("Error: document is encrypted", e);
  244               }
  245           }
  246   
  247           processPages( document.getDocumentCatalog().getAllPages() );
  248           endDocument(document);
  249       }
  250   
  251       /**
  252        * This will process all of the pages and the text that is in them.
  253        *
  254        * @param pages The pages object in the document.
  255        *
  256        * @throws IOException If there is an error parsing the text.
  257        */
  258       protected void processPages( List pages ) throws IOException
  259       {
  260           if( startBookmark != null )
  261           {
  262               startBookmarkPageNumber = getPageNumber( startBookmark, pages );
  263           }
  264   
  265           if( endBookmark != null )
  266           {
  267               endBookmarkPageNumber = getPageNumber( endBookmark, pages );
  268           }
  269   
  270           if( startBookmarkPageNumber == -1 && startBookmark != null &&
  271                   endBookmarkPageNumber == -1 && endBookmark != null &&
  272                   startBookmark.getCOSObject() == endBookmark.getCOSObject() )
  273           {
  274               //this is a special case where both the start and end bookmark
  275               //are the same but point to nothing.  In this case
  276               //we will not extract any text.
  277               startBookmarkPageNumber = 0;
  278               endBookmarkPageNumber = 0;
  279           }
  280   
  281   
  282           Iterator pageIter = pages.iterator();
  283           while( pageIter.hasNext() )
  284           {
  285               PDPage nextPage = (PDPage)pageIter.next();
  286               PDStream contentStream = nextPage.getContents();
  287               currentPageNo++;
  288               if( contentStream != null )
  289               {
  290                   COSStream contents = contentStream.getStream();
  291                   processPage( nextPage, contents );
  292               }
  293           }
  294       }
  295   
  296       private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
  297       {
  298           int pageNumber = -1;
  299           PDPage page = bookmark.findDestinationPage( document );
  300           if( page != null )
  301           {
  302               pageNumber = allPages.indexOf( page )+1;//use one based indexing
  303           }
  304           return pageNumber;
  305       }
  306   
  307       /**
  308        * This method is available for subclasses of this class.  It will be called before processing
  309        * of the document start.
  310        *
  311        * @param pdf The PDF document that is being processed.
  312        * @throws IOException If an IO error occurs.
  313        */
  314       protected void startDocument(PDDocument pdf) throws IOException
  315       {
  316           // no default implementation, but available for subclasses
  317       }
  318   
  319       /**
  320        * This method is available for subclasses of this class.  It will be called after processing
  321        * of the document finishes.
  322        *
  323        * @param pdf The PDF document that is being processed.
  324        * @throws IOException If an IO error occurs.
  325        */
  326       protected void endDocument(PDDocument pdf ) throws IOException
  327       {
  328           // no default implementation, but available for subclasses
  329       }
  330   
  331       /**
  332        * This will process the contents of a page.
  333        *
  334        * @param page The page to process.
  335        * @param content The contents of the page.
  336        *
  337        * @throws IOException If there is an error processing the page.
  338        */
  339       protected void processPage( PDPage page, COSStream content ) throws IOException
  340       {
  341           if( currentPageNo >= startPage && currentPageNo <= endPage &&
  342                   (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
  343                   (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
  344           {
  345               startPage( page );
  346               pageArticles = page.getThreadBeads();
  347               int numberOfArticleSections = 1 + pageArticles.size() * 2;
  348               if( !shouldSeparateByBeads )
  349               {
  350                   numberOfArticleSections = 1;
  351               }
  352               int originalSize = charactersByArticle.size();
  353               charactersByArticle.setSize( numberOfArticleSections );
  354               for( int i=0; i<numberOfArticleSections; i++ )
  355               {
  356                   if( numberOfArticleSections < originalSize )
  357                   {
  358                       ((List)charactersByArticle.get( i )).clear();
  359                   }
  360                   else
  361                   {
  362                       charactersByArticle.set( i, new ArrayList() );
  363                   }
  364               }
  365   
  366               characterListMapping.clear();
  367               processStream( page, page.findResources(), content );
  368               writePage();
  369               endPage( page );
  370           }
  371   
  372       }
  373   
  374       /**
  375        * Start a new article, which is typically defined as a column
  376        * on a single page (also referred to as a bead).  This assumes
  377        * that the primary direction of text is left to right.  
  378        * Default implementation is to do nothing.  Subclasses
  379        * may provide additional information.
  380        *
  381        * @throws IOException If there is any error writing to the stream.
  382        */
  383       protected void startArticle() throws IOException
  384       {
  385           startArticle(true);
  386       }
  387   
  388       /**
  389        * Start a new article, which is typically defined as a column
  390        * on a single page (also referred to as a bead).  
  391        * Default implementation is to do nothing.  Subclasses
  392        * may provide additional information.
  393        *
  394        * @param isltr true if primary direction of text is left to right.
  395        * @throws IOException If there is any error writing to the stream.
  396        */
  397       protected void startArticle(boolean isltr) throws IOException
  398       {
  399           //default is to do nothing.
  400       }
  401   
  402       /**
  403        * End an article.  Default implementation is to do nothing.  Subclasses
  404        * may provide additional information.
  405        *
  406        * @throws IOException If there is any error writing to the stream.
  407        */
  408       protected void endArticle() throws IOException
  409       {
  410           //default is to do nothing
  411       }
  412   
  413       /**
  414        * Start a new page.  Default implementation is to do nothing.  Subclasses
  415        * may provide additional information.
  416        *
  417        * @param page The page we are about to process.
  418        *
  419        * @throws IOException If there is any error writing to the stream.
  420        */
  421       protected void startPage( PDPage page ) throws IOException
  422       {
  423           //default is to do nothing.
  424       }
  425   
  426       /**
  427        * End a page.  Default implementation is to do nothing.  Subclasses
  428        * may provide additional information.
  429        *
  430        * @param page The page we are about to process.
  431        *
  432        * @throws IOException If there is any error writing to the stream.
  433        */
  434       protected void endPage( PDPage page ) throws IOException
  435       {
  436           //default is to do nothing
  437       }
  438   
  439   
  440       /**
  441        * This will print the text of the processed page to "output".
  442        * It will estimate, based on the coordinates of the text, where
  443        * newlines and word spacings should be placed. The text will be
  444        * sorted only if that feature was enabled. 
  445        *
  446        * @throws IOException If there is an error writing the text.
  447        */
  448       protected void writePage() throws IOException    
  449       {
  450           float maxYForLine = -1;
  451           float minYTopForLine = Float.MAX_VALUE;
  452           float endOfLastTextX = -1;
  453           float lastWordSpacing = -1;
  454           float maxHeightForLine = -1;
  455           TextPosition lastPosition = null;
  456   
  457           for( int i = 0; i < charactersByArticle.size(); i++)
  458           {
  459               List textList = (List)charactersByArticle.get( i );
  460               if( sortByPosition )
  461               {
  462                   TextPositionComparator comparator = new TextPositionComparator();
  463                   Collections.sort( textList, comparator );
  464               }
  465   
  466               Iterator textIter = textList.iterator();
  467   
  468               /* Before we can display the text, we need to do some normalizing.
  469                * Arabic and Hebrew text is right to left and is typically stored
  470                * in its logical format, which means that the rightmost character is 
  471                * stored first, followed by the second character from the right etc.
  472                * However, PDF stores the text in presentation form, which is left to 
  473                * right.  We need to do some normalization to convert the PDF data to
  474                * the proper logical output format. 
  475                * 
  476                * Note that if we did not sort the text, then the output of reversing the
  477                * text is undefined and can sometimes produce worse output then not trying
  478                * to reverse the order.  Sorting should be done for these languages.
  479                * */
  480   
  481               /* First step is to determine if we have any right to left text, and 
  482                * if so, is it dominant. */ 
  483               int ltrCnt = 0;
  484               int rtlCnt = 0;
  485   
  486               while( textIter.hasNext() )
  487               {
  488                   TextPosition position = (TextPosition)textIter.next();
  489                   String stringValue = position.getCharacter();
  490   
  491                   for (int a = 0; a < stringValue.length(); a++) 
  492                   {
  493                       byte dir = Character.getDirectionality(stringValue.charAt(a));
  494                       if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) || 
  495                               (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
  496                               (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE )) 
  497                       {
  498                           ltrCnt++;
  499                       }
  500                       else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
  501                               (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
  502                               (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
  503                               (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE )) 
  504                       {
  505                           rtlCnt++;
  506                       }
  507                   }
  508               }
  509   
  510               // choose the dominant direction
  511               boolean isRtlDominant = false; 
  512               if (rtlCnt > ltrCnt) 
  513               {
  514                   isRtlDominant = true;
  515               }
  516   
  517               startArticle(!isRtlDominant);
  518   
  519               // we will later use this to skip reordering
  520               boolean hasRtl = false;
  521               if (rtlCnt > 0)
  522               {
  523                   hasRtl = true;
  524               }
  525   
  526               /* Now cycle through to print the text.  
  527                * We queue up a line at a time before we print so that we can convert
  528                * the line from presentation form to logical form (if needed). */
  529               String lineStr = "";
  530   
  531               textIter = textList.iterator();    // start from the beginning again
  532   
  533               /* PDF files don't always store spaces. We will need to guess where we should add
  534                * spaces based on the distances between TextPositions. Historically, this was done
  535                * based on the size of the space character provided by the font. In general, this worked
  536                * but there were cases where it did not work. Calculating the average character width
  537                * and using that as a metric works better in some cases but fails in some cases where the 
  538                * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples. 
  539                */
  540   
  541               //Keeps track of the previous average character width
  542               float previousAveCharWidth = -1;
  543               while( textIter.hasNext() )
  544               {
  545                   TextPosition position = (TextPosition)textIter.next();
  546                   String characterValue = position.getCharacter();
  547   
  548                   //Resets the average character width when we see a change in font 
  549                   // or a change in the font size
  550                   if(lastPosition != null && ((position.getFont() != lastPosition.getFont()) 
  551                           || (position.getFontSize() != lastPosition.getFontSize())))
  552                   {
  553                       previousAveCharWidth = -1;
  554                   }
  555   
  556                   float positionX;
  557                   float positionY;
  558                   float positionWidth;
  559                   float positionHeight;
  560   
  561                   /* If we are sorting, then we need to use the text direction
  562                    * adjusted coordinates, because they were used in the sorting. */
  563                   if (sortByPosition) 
  564                   {
  565                       positionX = position.getXDirAdj();
  566                       positionY = position.getYDirAdj();
  567                       positionWidth = position.getWidthDirAdj();
  568                       positionHeight = position.getHeightDir();
  569                   }
  570                   else 
  571                   {
  572                       positionX = position.getX();
  573                       positionY = position.getY();
  574                       positionWidth = position.getWidth();
  575                       positionHeight = position.getHeight();
  576                   }
  577   
  578                   //The current amount of characters in a word
  579                   int wordCharCount = position.getIndividualWidths().length;
  580   
  581                   /* Estimate the expected width of the space based on the 
  582                    * space character with some margin. */
  583                   float wordSpacing = position.getWidthOfSpace();
  584                   float deltaSpace = 0;
  585                   if ((wordSpacing == 0) || (wordSpacing == Float.NaN)) 
  586                   {
  587                       deltaSpace = Float.MAX_VALUE;
  588                   }
  589                   else 
  590                   {
  591                       if( lastWordSpacing < 0 )
  592                       {
  593                           deltaSpace = (wordSpacing * spacingTolerance);
  594                       }
  595                       else
  596                       {
  597                           deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* spacingTolerance);
  598                       }
  599                   }
  600   
  601                   /* Estimate the expected width of the space based on the 
  602                    * average character width with some margin. This calculation does not
  603                    * make a true average (average of averages) but we found that it gave the
  604                    * best results after numerous experiments. Based on experiments we also found that
  605                    * .3 worked well. */                       
  606                   float averageCharWidth = -1;
  607                   if(previousAveCharWidth < 0)
  608                   {
  609                       averageCharWidth = (positionWidth/wordCharCount);
  610                   }
  611                   else
  612                   {
  613                       averageCharWidth = (previousAveCharWidth + (positionWidth/wordCharCount))/2f;
  614                   }
  615                   float deltaCharWidth = (averageCharWidth * averageCharTolerance);
  616                   
  617                   //Compares the values obtained by the average method and the wordSpacing method and picks
  618                   //the smaller number. 
  619                   float expectedStartOfNextWordX = -1;
  620                   if(endOfLastTextX != -1)
  621                   {
  622                       if(deltaCharWidth > deltaSpace)
  623                       {
  624                           expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
  625                       }
  626                       else
  627                       {
  628                           expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
  629                       }
  630                   }   
  631   
  632                   if( lastPosition != null )
  633                   {  
  634                       // RDD - Here we determine whether this text object is on the current
  635                       // line.  We use the lastBaselineFontSize to handle the superscript
  636                       // case, and the size of the current font to handle the subscript case.
  637                       // Text must overlap with the last rendered baseline text by at least
  638                       // a small amount in order to be considered as being on the same line.
  639   
  640                       /* XXX BC: In theory, this check should really check if the next char is in full range
  641                        * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
  642                        * of regression test failures.  So, I'm leaving it be for now. */
  643                       if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
  644                       {
  645                           // If we have RTL text on the page, change the direction
  646                           if (hasRtl)
  647                           {
  648                               lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
  649                           }
  650   
  651                           /* normalize string to remove presentation forms.
  652                            * Note that this must come after the line direction 
  653                            * conversion because the process looks ahead to the next
  654                            * logical character. 
  655                            */
  656                           lineStr = normalize.normalizePres(lineStr);
  657   
  658                           writeString(lineStr);
  659                           lineStr = "";
  660   
  661                           writeLineSeparator( );
  662   
  663                           endOfLastTextX = -1;
  664                           expectedStartOfNextWordX = -1;
  665                           maxYForLine = -1;
  666                           maxHeightForLine = -1;
  667                           minYTopForLine = Float.MAX_VALUE;
  668                       }
  669   
  670                       //Test if our TextPosition starts after a new word would be expected to start. 
  671                       if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
  672                               //only bother adding a space if the last character was not a space
  673                               lastPosition.getCharacter() != null &&
  674                               !lastPosition.getCharacter().endsWith( " " ) ) 
  675                       {
  676                           lineStr += getWordSeparator();
  677                       }
  678                   }
  679   
  680                   if (positionY >= maxYForLine) 
  681                   {
  682                       maxYForLine = positionY;
  683                   }
  684   
  685                   // RDD - endX is what PDF considers to be the x coordinate of the
  686                   // end position of the text.  We use it in computing our metrics below.
  687                   endOfLastTextX = positionX + positionWidth;
  688   
  689                   // add it to the list
  690                   if (characterValue != null) 
  691                   {
  692                       lineStr += characterValue;
  693                   }
  694                   maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
  695                   minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
  696                   lastPosition = position;
  697                   lastWordSpacing = wordSpacing;
  698                   previousAveCharWidth = averageCharWidth;
  699               }
  700   
  701               // print the final line
  702               if (lineStr.length() > 0) 
  703               {
  704                   if (hasRtl)
  705                   {
  706                       lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
  707                   }
  708   
  709                   // normalize string to remove presentation forms
  710                   lineStr = normalize.normalizePres(lineStr);
  711   
  712                   writeString(lineStr);
  713               }
  714   
  715               endArticle();
  716           }
  717           writePageSeperator();
  718       }
  719   
  720       private boolean overlap( float y1, float height1, float y2, float height2 )
  721       {
  722           return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
  723           (y1 <= y2 && y1 >= y2-height2);
  724       }
  725   
  726       /**
  727        * Write the page separator value to the output stream.
  728        * @throws IOException
  729        *             If there is a problem writing out the pageseparator to the document.
  730        */
  731       protected void writePageSeperator() throws IOException
  732       {
  733           // RDD - newline at end of flush - required for end of page (so that the top
  734           // of the next page starts on its own line.
  735           //
  736           output.write(getPageSeparator());
  737           output.flush();
  738       }
  739   
  740       /**
  741        * Write the line separator value to the output stream.
  742        * @throws IOException
  743        *             If there is a problem writing out the lineseparator to the document.
  744        */
  745       protected void writeLineSeparator( ) throws IOException
  746       {
  747           output.write(getLineSeparator());
  748       }
  749   
  750   
  751       /**
  752        * Write the word separator value to the output stream.
  753        * @throws IOException
  754        *             If there is a problem writing out the wordseparator to the document.
  755        */
  756       protected void writeWordSeparator() throws IOException
  757       {
  758           output.write(getWordSeparator());
  759       }
  760   
  761       /**
  762        * Write the string in TextPosition to the output stream.
  763        *
  764        * @param text The text to write to the stream.
  765        * @throws IOException If there is an error when writing the text.
  766        */
  767       protected void writeCharacters( TextPosition text ) throws IOException
  768       {
  769           output.write( text.getCharacter() );
  770       }
  771   
  772       /**
  773        * Write a Java string to the output stream.
  774        *
  775        * @param text The text to write to the stream.
  776        * @throws IOException If there is an error when writing the text.
  777        */
  778       protected void writeString( String text ) throws IOException
  779       {
  780           output.write( text );
  781       }
  782   
  783       /**
  784        * This will determine of two floating point numbers are within a specified variance.
  785        *
  786        * @param first The first number to compare to.
  787        * @param second The second number to compare to.
  788        * @param variance The allowed variance.
  789        */
  790       private boolean within( float first, float second, float variance )
  791       {
  792           return second < first + variance && second > first - variance;
  793       }
  794   
  795   
  796       /**
  797        * This will process a TextPosition object and add the
  798        * text to the list of characters on a page.  It takes care of
  799        * overlapping text.
  800        *
  801        * @param text The text to process.
  802        */
  803       protected void processTextPosition( TextPosition text )
  804       {
  805           boolean showCharacter = true;
  806           if( suppressDuplicateOverlappingText )
  807           {
  808               showCharacter = false;
  809               String textCharacter = text.getCharacter();
  810               float textX = text.getX();
  811               float textY = text.getY();
  812               List sameTextCharacters = (List)characterListMapping.get( textCharacter );
  813               if( sameTextCharacters == null )
  814               {
  815                   sameTextCharacters = new ArrayList();
  816                   characterListMapping.put( textCharacter, sameTextCharacters );
  817               }
  818   
  819               // RDD - Here we compute the value that represents the end of the rendered
  820               // text.  This value is used to determine whether subsequent text rendered
  821               // on the same line overwrites the current text.
  822               //
  823               // We subtract any positive padding to handle cases where extreme amounts
  824               // of padding are applied, then backed off (not sure why this is done, but there
  825               // are cases where the padding is on the order of 10x the character width, and
  826               // the TJ just backs up to compensate after each character).  Also, we subtract
  827               // an amount to allow for kerning (a percentage of the width of the last
  828               // character).
  829               //
  830               boolean suppressCharacter = false;
  831               float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
  832               for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
  833               {
  834                   TextPosition character = (TextPosition)sameTextCharacters.get( i );
  835                   String charCharacter = character.getCharacter();
  836                   float charX = character.getX();
  837                   float charY = character.getY();
  838                   //only want to suppress
  839   
  840                   if( charCharacter != null &&
  841                           //charCharacter.equals( textCharacter ) &&
  842                           within( charX, textX, tolerance ) &&
  843                           within( charY,
  844                                   textY,
  845                                   tolerance ) )
  846                   {
  847                       suppressCharacter = true;
  848                   }
  849               }
  850               if( !suppressCharacter )
  851               {
  852                   sameTextCharacters.add( text );
  853                   showCharacter = true;
  854               }
  855           }
  856   
  857           if( showCharacter )
  858           {
  859               //if we are showing the character then we need to determine which
  860               //article it belongs to.
  861               int foundArticleDivisionIndex = -1;
  862               int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
  863               int notFoundButFirstLeftArticleDivisionIndex = -1;
  864               int notFoundButFirstAboveArticleDivisionIndex = -1;
  865               float x = text.getX();
  866               float y = text.getY();
  867               if( shouldSeparateByBeads )
  868               {
  869                   for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
  870                   {
  871                       PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
  872                       if( bead != null )
  873                       {
  874                           PDRectangle rect = bead.getRectangle();
  875                           if( rect.contains( x, y ) )
  876                           {
  877                               foundArticleDivisionIndex = i*2+1;
  878                           }
  879                           else if( (x < rect.getLowerLeftX() ||
  880                                   y < rect.getUpperRightY()) &&
  881                                   notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
  882                           {
  883                               notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
  884                           }
  885                           else if( x < rect.getLowerLeftX() &&
  886                                   notFoundButFirstLeftArticleDivisionIndex == -1)
  887                           {
  888                               notFoundButFirstLeftArticleDivisionIndex = i*2;
  889                           }
  890                           else if( y < rect.getUpperRightY() &&
  891                                   notFoundButFirstAboveArticleDivisionIndex == -1)
  892                           {
  893                               notFoundButFirstAboveArticleDivisionIndex = i*2;
  894                           }
  895                       }
  896                       else
  897                       {
  898                           foundArticleDivisionIndex = 0;
  899                       }
  900                   }
  901               }
  902               else
  903               {
  904                   foundArticleDivisionIndex = 0;
  905               }
  906               int articleDivisionIndex = -1;
  907               if( foundArticleDivisionIndex != -1 )
  908               {
  909                   articleDivisionIndex = foundArticleDivisionIndex;
  910               }
  911               else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
  912               {
  913                   articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
  914               }
  915               else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
  916               {
  917                   articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
  918               }
  919               else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
  920               {
  921                   articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
  922               }
  923               else
  924               {
  925                   articleDivisionIndex = charactersByArticle.size()-1;
  926               }
  927   
  928               List textList = (List) charactersByArticle.get( articleDivisionIndex );
  929   
  930               /* In the wild, some PDF encoded documents put diacritics (accents on
  931                * top of characters) into a separate Tj element.  When displaying them
  932                * graphically, the two chunks get overlayed.  With text output though,
  933                * we need to do the overlay. This code recombines the diacritic with
  934                * its associated character if the two are consecutive.
  935                */ 
  936               if(textList.isEmpty())
  937               {
  938                   textList.add(text);
  939               }
  940               else
  941               {
  942                   /* test if we overlap the previous entry.  
  943                    * Note that we are making an assumption that we need to only look back
  944                    * one TextPosition to find what we are overlapping.  
  945                    * This may not always be true. */
  946                   TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
  947                   if(text.isDiacritic() && previousTextPosition.contains(text))
  948                   {
  949                       previousTextPosition.mergeDiacritic(text, normalize);
  950                   }
  951                   /* If the previous TextPosition was the diacritic, merge it into this
  952                    * one and remove it from the list. */
  953                   else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
  954                   {
  955                       text.mergeDiacritic(previousTextPosition, normalize);
  956                       textList.remove(textList.size()-1);
  957                       textList.add(text);
  958                   }
  959                   else
  960                   {
  961                       textList.add(text);
  962                   }
  963               }
  964           }
  965       }
  966   
  967       /**
  968        * This is the page that the text extraction will start on.  The pages start
  969        * at page 1.  For example in a 5 page PDF document, if the start page is 1
  970        * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
  971        * will be extracted.  The default value is 1.
  972        *
  973        * @return Value of property startPage.
  974        */
  975       public int getStartPage()
  976       {
  977           return startPage;
  978       }
  979   
  980       /**
  981        * This will set the first page to be extracted by this class.
  982        *
  983        * @param startPageValue New value of property startPage.
  984        */
  985       public void setStartPage(int startPageValue)
  986       {
  987           startPage = startPageValue;
  988       }
  989   
  990       /**
  991        * This will get the last page that will be extracted.  This is inclusive,
  992        * for example if a 5 page PDF an endPage value of 5 would extract the
  993        * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
  994        * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
  995        *
  996        * @return Value of property endPage.
  997        */
  998       public int getEndPage()
  999       {
 1000           return endPage;
 1001       }
 1002   
 1003       /**
 1004        * This will set the last page to be extracted by this class.
 1005        *
 1006        * @param endPageValue New value of property endPage.
 1007        */
 1008       public void setEndPage(int endPageValue)
 1009       {
 1010           endPage = endPageValue;
 1011       }
 1012   
 1013       /**
 1014        * Set the desired line separator for output text.  The line.separator
 1015        * system property is used if the line separator preference is not set
 1016        * explicitly using this method.
 1017        *
 1018        * @param separator The desired line separator string.
 1019        */
 1020       public void setLineSeparator(String separator)
 1021       {
 1022           lineSeparator = separator;
 1023       }
 1024   
 1025       /**
 1026        * This will get the line separator.
 1027        *
 1028        * @return The desired line separator string.
 1029        */
 1030       public String getLineSeparator()
 1031       {
 1032           return lineSeparator;
 1033       }
 1034   
 1035       /**
 1036        * Set the desired page separator for output text.  The line.separator
 1037        * system property is used if the page separator preference is not set
 1038        * explicitly using this method.
 1039        *
 1040        * @param separator The desired page separator string.
 1041        */
 1042       public void setPageSeparator(String separator)
 1043       {
 1044           pageSeparator = separator;
 1045       }
 1046   
 1047       /**
 1048        * This will get the word separator.
 1049        *
 1050        * @return The desired word separator string.
 1051        */
 1052       public String getWordSeparator()
 1053       {
 1054           return wordSeparator;
 1055       }
 1056   
 1057       /**
 1058        * Set the desired word separator for output text.  The PDFBox text extraction
 1059        * algorithm will output a space character if there is enough space between
 1060        * two words.  By default a space character is used.  If you need and accurate
 1061        * count of characters that are found in a PDF document then you might want to
 1062        * set the word separator to the empty string.
 1063        *
 1064        * @param separator The desired page separator string.
 1065        */
 1066       public void setWordSeparator(String separator)
 1067       {
 1068           wordSeparator = separator;
 1069       }
 1070   
 1071       /**
 1072        * This will get the page separator.
 1073        *
 1074        * @return The page separator string.
 1075        */
 1076       public String getPageSeparator()
 1077       {
 1078           return pageSeparator;
 1079       }
 1080       /**
 1081        * @return Returns the suppressDuplicateOverlappingText.
 1082        */
 1083       public boolean shouldSuppressDuplicateOverlappingText()
 1084       {
 1085           return suppressDuplicateOverlappingText;
 1086       }
 1087   
 1088       /**
 1089        * Get the current page number that is being processed.
 1090        *
 1091        * @return A 1 based number representing the current page.
 1092        */
 1093       protected int getCurrentPageNo()
 1094       {
 1095           return currentPageNo;
 1096       }
 1097   
 1098       /**
 1099        * The output stream that is being written to.
 1100        *
 1101        * @return The stream that output is being written to.
 1102        */
 1103       protected Writer getOutput()
 1104       {
 1105           return output;
 1106       }
 1107   
 1108       /**
 1109        * Character strings are grouped by articles.  It is quite common that there
 1110        * will only be a single article.  This returns a List that contains List objects,
 1111        * the inner lists will contain TextPosition objects.
 1112        *
 1113        * @return A double List of TextPositions for all text strings on the page.
 1114        */
 1115       protected List getCharactersByArticle()
 1116       {
 1117           return charactersByArticle;
 1118       }
 1119   
 1120       /**
 1121        * By default the text stripper will attempt to remove text that overlapps each other.
 1122        * Word paints the same character several times in order to make it look bold.  By setting
 1123        * this to false all text will be extracted, which means that certain sections will be
 1124        * duplicated, but better performance will be noticed.
 1125        *
 1126        * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
 1127        */
 1128       public void setSuppressDuplicateOverlappingText(
 1129               boolean suppressDuplicateOverlappingTextValue)
 1130       {
 1131           this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
 1132       }
 1133   
 1134       /**
 1135        * This will tell if the text stripper should separate by beads.
 1136        *
 1137        * @return If the text will be grouped by beads.
 1138        */
 1139       public boolean shouldSeparateByBeads()
 1140       {
 1141           return shouldSeparateByBeads;
 1142       }
 1143   
 1144       /**
 1145        * Set if the text stripper should group the text output by a list of beads.  The default value is true!
 1146        *
 1147        * @param aShouldSeparateByBeads The new grouping of beads.
 1148        */
 1149       public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
 1150       {
 1151           this.shouldSeparateByBeads = aShouldSeparateByBeads;
 1152       }
 1153   
 1154       /**
 1155        * Get the bookmark where text extraction should end, inclusive.  Default is null.
 1156        *
 1157        * @return The ending bookmark.
 1158        */
 1159       public PDOutlineItem getEndBookmark()
 1160       {
 1161           return endBookmark;
 1162       }
 1163   
 1164       /**
 1165        * Set the bookmark where the text extraction should stop.
 1166        *
 1167        * @param aEndBookmark The ending bookmark.
 1168        */
 1169       public void setEndBookmark(PDOutlineItem aEndBookmark)
 1170       {
 1171           endBookmark = aEndBookmark;
 1172       }
 1173   
 1174       /**
 1175        * Get the bookmark where text extraction should start, inclusive.  Default is null.
 1176        *
 1177        * @return The starting bookmark.
 1178        */
 1179       public PDOutlineItem getStartBookmark()
 1180       {
 1181           return startBookmark;
 1182       }
 1183   
 1184       /**
 1185        * Set the bookmark where text extraction should start, inclusive.
 1186        *
 1187        * @param aStartBookmark The starting bookmark.
 1188        */
 1189       public void setStartBookmark(PDOutlineItem aStartBookmark)
 1190       {
 1191           startBookmark = aStartBookmark;
 1192       }
 1193   
 1194       /**
 1195        * This will tell if the text stripper should sort the text tokens
 1196        * before writing to the stream.
 1197        *
 1198        * @return true If the text tokens will be sorted before being written.
 1199        */
 1200       public boolean shouldSortByPosition()
 1201       {
 1202           return sortByPosition;
 1203       }
 1204   
 1205       /**
 1206        * The order of the text tokens in a PDF file may not be in the same
 1207        * as they appear visually on the screen.  For example, a PDF writer may
 1208        * write out all text by font, so all bold or larger text, then make a second
 1209        * pass and write out the normal text.<br/>
 1210        * The default is to <b>not</b> sort by position.<br/>
 1211        * <br/>
 1212        * A PDF writer could choose to write each character in a different order.  By
 1213        * default PDFBox does <b>not</b> sort the text tokens before processing them due to
 1214        * performance reasons.
 1215        *
 1216        * @param newSortByPosition Tell PDFBox to sort the text positions.
 1217        */
 1218       public void setSortByPosition(boolean newSortByPosition)
 1219       {
 1220           sortByPosition = newSortByPosition;
 1221       }
 1222   
 1223       /**
 1224        * Get the current space width-based tolerance value that is being used
 1225        * to estimate where spaces in text should be added.  Note that the
 1226        * default value for this has been determined from trial and error. 
 1227        * 
 1228        * @return The current tolerance / scaling factor
 1229        */
 1230       public float getSpacingTolerance() 
 1231       {
 1232           return spacingTolerance;
 1233       }
 1234   
 1235       /**
 1236        * Set the space width-based tolerance value that is used
 1237        * to estimate where spaces in text should be added.  Note that the
 1238        * default value for this has been determined from trial and error.
 1239        * Setting this value larger will reduce the number of spaces added. 
 1240        * 
 1241        * @param spacingToleranceValue tolerance / scaling factor to use
 1242        */
 1243       public void setSpacingTolerance(float spacingToleranceValue)
 1244       {
 1245           this.spacingTolerance = spacingToleranceValue;
 1246       }
 1247   
 1248       /**
 1249        * Get the current character width-based tolerance value that is being used
 1250        * to estimate where spaces in text should be added.  Note that the
 1251        * default value for this has been determined from trial and error.
 1252        * 
 1253        * @return The current tolerance / scaling factor
 1254        */
 1255       public float getAverageCharTolerance() 
 1256       {
 1257           return averageCharTolerance;
 1258       }
 1259   
 1260       /**
 1261        * Set the character width-based tolerance value that is used
 1262        * to estimate where spaces in text should be added.  Note that the
 1263        * default value for this has been determined from trial and error.
 1264        * Setting this value larger will reduce the number of spaces added. 
 1265        * 
 1266        * @param averageCharToleranceValue average tolerance / scaling factor to use
 1267        */
 1268       public void setAverageCharTolerance(float averageCharToleranceValue) 
 1269       {
 1270           this.averageCharTolerance = averageCharToleranceValue;
 1271       }
 1272   }

Home » pdfbox-1.1.0-src » org.apache.pdfbox.util » [javadoc | source]