1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.pdfbox.util; 18 19 import java.io.IOException; 20 import java.io.StringWriter; 21 import java.io.Writer; 22 import java.util.ArrayList; 23 import java.util.Collections; 24 import java.util.HashMap; 25 import java.util.Iterator; 26 import java.util.List; 27 import java.util.Map; 28 import java.util.Properties; 29 import java.util.Vector; 30 31 import org.apache.pdfbox.cos.COSDocument; 32 import org.apache.pdfbox.cos.COSStream; 33 import org.apache.pdfbox.exceptions.CryptographyException; 34 import org.apache.pdfbox.exceptions.InvalidPasswordException; 35 import org.apache.pdfbox.exceptions.WrappedIOException; 36 import org.apache.pdfbox.pdmodel.PDDocument; 37 import org.apache.pdfbox.pdmodel.PDPage; 38 import org.apache.pdfbox.pdmodel.common.PDRectangle; 39 import org.apache.pdfbox.pdmodel.common.PDStream; 40 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; 41 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; 42 43 44 /** 45 * This class will take a pdf document and strip out all of the text and ignore the 46 * formatting and such. 
Please note; it is up to clients of this class to verify that 47 * a specific user has the correct permissions to extract text from the 48 * PDF document. 49 * 50 * The basic flow of this process is that we get a document and use a series of 51 * processXXX() functions that work on smaller and smaller chunks of the page. 52 * Eventually, we fully process each page and then print it. 53 * 54 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> 55 * @version $Revision: 1.70 $ 56 */ 57 public class PDFTextStripper extends PDFStreamEngine 58 { 59 private int currentPageNo = 0; 60 private int startPage = 1; 61 private int endPage = Integer.MAX_VALUE; 62 private PDOutlineItem startBookmark = null; 63 private int startBookmarkPageNumber = -1; 64 private PDOutlineItem endBookmark = null; 65 private int endBookmarkPageNumber = -1; 66 private boolean suppressDuplicateOverlappingText = true; 67 private boolean shouldSeparateByBeads = true; 68 private boolean sortByPosition = false; 69 70 // We will need to estimate where to add spaces. 71 // These are used to help guess. 72 private float spacingTolerance = .5f; 73 private float averageCharTolerance = .3f; 74 75 private List pageArticles = null; 76 /** 77 * The charactersByArticle is used to extract text by article divisions. For example 78 * a PDF that has two columns like a newspaper, we want to extract the first column and 79 * then the second column. In this example the PDF would have 2 beads(or articles), one for 80 * each column. The size of the charactersByArticle would be 5, because not all text on the 81 * screen will fall into one of the articles. The five divisions are shown below 82 * 83 * Text before first article 84 * first article text 85 * text between first article and second article 86 * second article text 87 * text after second article 88 * 89 * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. 
90 */ 91 protected Vector charactersByArticle = new Vector(); 92 93 private Map characterListMapping = new HashMap(); 94 95 /** 96 * The platforms lineseparator. 97 */ 98 protected String lineSeparator = System.getProperty("line.separator"); 99 private String pageSeparator = System.getProperty("line.separator"); 100 private String wordSeparator = " "; 101 /** 102 * encoding that text will be written in (or null). 103 */ 104 protected String outputEncoding; 105 106 /** 107 * The document to read. 108 */ 109 protected PDDocument document; 110 /** 111 * The stream to write the output to. 112 */ 113 protected Writer output; 114 115 /** 116 * The normalizer is used to remove text ligatures/presentation forms 117 * and to correct the direction of right to left text, such as Arabic and Hebrew. 118 */ 119 private TextNormalize normalize = null; 120 121 /** 122 * Instantiate a new PDFTextStripper object. This object will load properties from 123 * Resources/PDFTextStripper.properties and will not do anything special to 124 * convert the text to a more encoding-specific output. 125 * @throws IOException If there is an error loading the properties. 126 */ 127 public PDFTextStripper() throws IOException 128 { 129 super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties", true ) ); 130 this.outputEncoding = null; 131 normalize = new TextNormalize(this.outputEncoding); 132 } 133 134 135 /** 136 * Instantiate a new PDFTextStripper object. Loading all of the operator mappings 137 * from the properties object that is passed in. Does not convert the text 138 * to more encoding-specific output. 139 * 140 * @param props The properties containing the mapping of operators to PDFOperator 141 * classes. 142 * 143 * @throws IOException If there is an error reading the properties. 
144 */ 145 public PDFTextStripper( Properties props ) throws IOException 146 { 147 super( props ); 148 this.outputEncoding = null; 149 normalize = new TextNormalize(this.outputEncoding); 150 } 151 /** 152 * Instantiate a new PDFTextStripper object. This object will load properties from 153 * Resources/PDFTextStripper.properties and will apply encoding-specific 154 * conversions to the output text. 155 * 156 * @param encoding The encoding that the output will be written in. 157 * 158 * @throws IOException If there is an error reading the properties. 159 */ 160 public PDFTextStripper( String encoding ) throws IOException 161 { 162 super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties", true )); 163 this.outputEncoding = encoding; 164 normalize = new TextNormalize(this.outputEncoding); 165 } 166 167 /** 168 * This will return the text of a document. See writeText. <br /> 169 * NOTE: The document must not be encrypted when coming into this method. 170 * 171 * @param doc The document to get the text from. 172 * 173 * @return The text of the PDF document. 174 * 175 * @throws IOException if the doc state is invalid or it is encrypted. 176 */ 177 public String getText( PDDocument doc ) throws IOException 178 { 179 StringWriter outputStream = new StringWriter(); 180 writeText( doc, outputStream ); 181 return outputStream.toString(); 182 } 183 184 /** 185 * @deprecated 186 * @see PDFTextStripper#getText( PDDocument ) 187 * @param doc The document to extract the text from. 188 * @return The document text. 189 * @throws IOException If there is an error extracting the text. 190 */ 191 public String getText( COSDocument doc ) throws IOException 192 { 193 return getText( new PDDocument( doc ) ); 194 } 195 196 /** 197 * @deprecated 198 * @see PDFTextStripper#writeText( PDDocument, Writer ) 199 * @param doc The document to extract the text. 200 * @param outputStream The stream to write the text to. 
201 * @throws IOException If there is an error extracting the text. 202 */ 203 public void writeText( COSDocument doc, Writer outputStream ) throws IOException 204 { 205 writeText( new PDDocument( doc ), outputStream ); 206 } 207 208 /** 209 * This will take a PDDocument and write the text of that document to the print writer. 210 * 211 * @param doc The document to get the data from. 212 * @param outputStream The location to put the text. 213 * 214 * @throws IOException If the doc is in an invalid state. 215 */ 216 public void writeText( PDDocument doc, Writer outputStream ) throws IOException 217 { 218 resetEngine(); 219 220 currentPageNo = 0; 221 document = doc; 222 output = outputStream; 223 startDocument(document); 224 225 if( document.isEncrypted() ) 226 { 227 // We are expecting non-encrypted documents here, but it is common 228 // for users to pass in a document that is encrypted with an empty 229 // password (such a document appears to not be encrypted by 230 // someone viewing the document, thus the confusion). We will 231 // attempt to decrypt with the empty password to handle this case. 232 // 233 try 234 { 235 document.decrypt(""); 236 } 237 catch (CryptographyException e) 238 { 239 throw new WrappedIOException("Error decrypting document, details: ", e); 240 } 241 catch (InvalidPasswordException e) 242 { 243 throw new WrappedIOException("Error: document is encrypted", e); 244 } 245 } 246 247 processPages( document.getDocumentCatalog().getAllPages() ); 248 endDocument(document); 249 } 250 251 /** 252 * This will process all of the pages and the text that is in them. 253 * 254 * @param pages The pages object in the document. 255 * 256 * @throws IOException If there is an error parsing the text. 
257 */ 258 protected void processPages( List pages ) throws IOException 259 { 260 if( startBookmark != null ) 261 { 262 startBookmarkPageNumber = getPageNumber( startBookmark, pages ); 263 } 264 265 if( endBookmark != null ) 266 { 267 endBookmarkPageNumber = getPageNumber( endBookmark, pages ); 268 } 269 270 if( startBookmarkPageNumber == -1 && startBookmark != null && 271 endBookmarkPageNumber == -1 && endBookmark != null && 272 startBookmark.getCOSObject() == endBookmark.getCOSObject() ) 273 { 274 //this is a special case where both the start and end bookmark 275 //are the same but point to nothing. In this case 276 //we will not extract any text. 277 startBookmarkPageNumber = 0; 278 endBookmarkPageNumber = 0; 279 } 280 281 282 Iterator pageIter = pages.iterator(); 283 while( pageIter.hasNext() ) 284 { 285 PDPage nextPage = (PDPage)pageIter.next(); 286 PDStream contentStream = nextPage.getContents(); 287 currentPageNo++; 288 if( contentStream != null ) 289 { 290 COSStream contents = contentStream.getStream(); 291 processPage( nextPage, contents ); 292 } 293 } 294 } 295 296 private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException 297 { 298 int pageNumber = -1; 299 PDPage page = bookmark.findDestinationPage( document ); 300 if( page != null ) 301 { 302 pageNumber = allPages.indexOf( page )+1;//use one based indexing 303 } 304 return pageNumber; 305 } 306 307 /** 308 * This method is available for subclasses of this class. It will be called before processing 309 * of the document start. 310 * 311 * @param pdf The PDF document that is being processed. 312 * @throws IOException If an IO error occurs. 313 */ 314 protected void startDocument(PDDocument pdf) throws IOException 315 { 316 // no default implementation, but available for subclasses 317 } 318 319 /** 320 * This method is available for subclasses of this class. It will be called after processing 321 * of the document finishes. 
322 * 323 * @param pdf The PDF document that is being processed. 324 * @throws IOException If an IO error occurs. 325 */ 326 protected void endDocument(PDDocument pdf ) throws IOException 327 { 328 // no default implementation, but available for subclasses 329 } 330 331 /** 332 * This will process the contents of a page. 333 * 334 * @param page The page to process. 335 * @param content The contents of the page. 336 * 337 * @throws IOException If there is an error processing the page. 338 */ 339 protected void processPage( PDPage page, COSStream content ) throws IOException 340 { 341 if( currentPageNo >= startPage && currentPageNo <= endPage && 342 (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && 343 (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber )) 344 { 345 startPage( page ); 346 pageArticles = page.getThreadBeads(); 347 int numberOfArticleSections = 1 + pageArticles.size() * 2; 348 if( !shouldSeparateByBeads ) 349 { 350 numberOfArticleSections = 1; 351 } 352 int originalSize = charactersByArticle.size(); 353 charactersByArticle.setSize( numberOfArticleSections ); 354 for( int i=0; i<numberOfArticleSections; i++ ) 355 { 356 if( numberOfArticleSections < originalSize ) 357 { 358 ((List)charactersByArticle.get( i )).clear(); 359 } 360 else 361 { 362 charactersByArticle.set( i, new ArrayList() ); 363 } 364 } 365 366 characterListMapping.clear(); 367 processStream( page, page.findResources(), content ); 368 writePage(); 369 endPage( page ); 370 } 371 372 } 373 374 /** 375 * Start a new article, which is typically defined as a column 376 * on a single page (also referred to as a bead). This assumes 377 * that the primary direction of text is left to right. 378 * Default implementation is to do nothing. Subclasses 379 * may provide additional information. 380 * 381 * @throws IOException If there is any error writing to the stream. 
382 */ 383 protected void startArticle() throws IOException 384 { 385 startArticle(true); 386 } 387 388 /** 389 * Start a new article, which is typically defined as a column 390 * on a single page (also referred to as a bead). 391 * Default implementation is to do nothing. Subclasses 392 * may provide additional information. 393 * 394 * @param isltr true if primary direction of text is left to right. 395 * @throws IOException If there is any error writing to the stream. 396 */ 397 protected void startArticle(boolean isltr) throws IOException 398 { 399 //default is to do nothing. 400 } 401 402 /** 403 * End an article. Default implementation is to do nothing. Subclasses 404 * may provide additional information. 405 * 406 * @throws IOException If there is any error writing to the stream. 407 */ 408 protected void endArticle() throws IOException 409 { 410 //default is to do nothing 411 } 412 413 /** 414 * Start a new page. Default implementation is to do nothing. Subclasses 415 * may provide additional information. 416 * 417 * @param page The page we are about to process. 418 * 419 * @throws IOException If there is any error writing to the stream. 420 */ 421 protected void startPage( PDPage page ) throws IOException 422 { 423 //default is to do nothing. 424 } 425 426 /** 427 * End a page. Default implementation is to do nothing. Subclasses 428 * may provide additional information. 429 * 430 * @param page The page we are about to process. 431 * 432 * @throws IOException If there is any error writing to the stream. 433 */ 434 protected void endPage( PDPage page ) throws IOException 435 { 436 //default is to do nothing 437 } 438 439 440 /** 441 * This will print the text of the processed page to "output". 442 * It will estimate, based on the coordinates of the text, where 443 * newlines and word spacings should be placed. The text will be 444 * sorted only if that feature was enabled. 445 * 446 * @throws IOException If there is an error writing the text. 
447 */ 448 protected void writePage() throws IOException 449 { 450 float maxYForLine = -1; 451 float minYTopForLine = Float.MAX_VALUE; 452 float endOfLastTextX = -1; 453 float lastWordSpacing = -1; 454 float maxHeightForLine = -1; 455 TextPosition lastPosition = null; 456 457 for( int i = 0; i < charactersByArticle.size(); i++) 458 { 459 List textList = (List)charactersByArticle.get( i ); 460 if( sortByPosition ) 461 { 462 TextPositionComparator comparator = new TextPositionComparator(); 463 Collections.sort( textList, comparator ); 464 } 465 466 Iterator textIter = textList.iterator(); 467 468 /* Before we can display the text, we need to do some normalizing. 469 * Arabic and Hebrew text is right to left and is typically stored 470 * in its logical format, which means that the rightmost character is 471 * stored first, followed by the second character from the right etc. 472 * However, PDF stores the text in presentation form, which is left to 473 * right. We need to do some normalization to convert the PDF data to 474 * the proper logical output format. 475 * 476 * Note that if we did not sort the text, then the output of reversing the 477 * text is undefined and can sometimes produce worse output then not trying 478 * to reverse the order. Sorting should be done for these languages. 479 * */ 480 481 /* First step is to determine if we have any right to left text, and 482 * if so, is it dominant. 
*/ 483 int ltrCnt = 0; 484 int rtlCnt = 0; 485 486 while( textIter.hasNext() ) 487 { 488 TextPosition position = (TextPosition)textIter.next(); 489 String stringValue = position.getCharacter(); 490 491 for (int a = 0; a < stringValue.length(); a++) 492 { 493 byte dir = Character.getDirectionality(stringValue.charAt(a)); 494 if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) || 495 (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) || 496 (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE )) 497 { 498 ltrCnt++; 499 } 500 else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) || 501 (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) || 502 (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) || 503 (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE )) 504 { 505 rtlCnt++; 506 } 507 } 508 } 509 510 // choose the dominant direction 511 boolean isRtlDominant = false; 512 if (rtlCnt > ltrCnt) 513 { 514 isRtlDominant = true; 515 } 516 517 startArticle(!isRtlDominant); 518 519 // we will later use this to skip reordering 520 boolean hasRtl = false; 521 if (rtlCnt > 0) 522 { 523 hasRtl = true; 524 } 525 526 /* Now cycle through to print the text. 527 * We queue up a line at a time before we print so that we can convert 528 * the line from presentation form to logical form (if needed). */ 529 String lineStr = ""; 530 531 textIter = textList.iterator(); // start from the beginning again 532 533 /* PDF files don't always store spaces. We will need to guess where we should add 534 * spaces based on the distances between TextPositions. Historically, this was done 535 * based on the size of the space character provided by the font. In general, this worked 536 * but there were cases where it did not work. Calculating the average character width 537 * and using that as a metric works better in some cases but fails in some cases where the 538 * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples. 
539 */ 540 541 //Keeps track of the previous average character width 542 float previousAveCharWidth = -1; 543 while( textIter.hasNext() ) 544 { 545 TextPosition position = (TextPosition)textIter.next(); 546 String characterValue = position.getCharacter(); 547 548 //Resets the average character width when we see a change in font 549 // or a change in the font size 550 if(lastPosition != null && ((position.getFont() != lastPosition.getFont()) 551 || (position.getFontSize() != lastPosition.getFontSize()))) 552 { 553 previousAveCharWidth = -1; 554 } 555 556 float positionX; 557 float positionY; 558 float positionWidth; 559 float positionHeight; 560 561 /* If we are sorting, then we need to use the text direction 562 * adjusted coordinates, because they were used in the sorting. */ 563 if (sortByPosition) 564 { 565 positionX = position.getXDirAdj(); 566 positionY = position.getYDirAdj(); 567 positionWidth = position.getWidthDirAdj(); 568 positionHeight = position.getHeightDir(); 569 } 570 else 571 { 572 positionX = position.getX(); 573 positionY = position.getY(); 574 positionWidth = position.getWidth(); 575 positionHeight = position.getHeight(); 576 } 577 578 //The current amount of characters in a word 579 int wordCharCount = position.getIndividualWidths().length; 580 581 /* Estimate the expected width of the space based on the 582 * space character with some margin. */ 583 float wordSpacing = position.getWidthOfSpace(); 584 float deltaSpace = 0; 585 if ((wordSpacing == 0) || (wordSpacing == Float.NaN)) 586 { 587 deltaSpace = Float.MAX_VALUE; 588 } 589 else 590 { 591 if( lastWordSpacing < 0 ) 592 { 593 deltaSpace = (wordSpacing * spacingTolerance); 594 } 595 else 596 { 597 deltaSpace = (((wordSpacing+lastWordSpacing)/2f)* spacingTolerance); 598 } 599 } 600 601 /* Estimate the expected width of the space based on the 602 * average character width with some margin. 
This calculation does not 603 * make a true average (average of averages) but we found that it gave the 604 * best results after numerous experiments. Based on experiments we also found that 605 * .3 worked well. */ 606 float averageCharWidth = -1; 607 if(previousAveCharWidth < 0) 608 { 609 averageCharWidth = (positionWidth/wordCharCount); 610 } 611 else 612 { 613 averageCharWidth = (previousAveCharWidth + (positionWidth/wordCharCount))/2f; 614 } 615 float deltaCharWidth = (averageCharWidth * averageCharTolerance); 616 617 //Compares the values obtained by the average method and the wordSpacing method and picks 618 //the smaller number. 619 float expectedStartOfNextWordX = -1; 620 if(endOfLastTextX != -1) 621 { 622 if(deltaCharWidth > deltaSpace) 623 { 624 expectedStartOfNextWordX = endOfLastTextX + deltaSpace; 625 } 626 else 627 { 628 expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth; 629 } 630 } 631 632 if( lastPosition != null ) 633 { 634 // RDD - Here we determine whether this text object is on the current 635 // line. We use the lastBaselineFontSize to handle the superscript 636 // case, and the size of the current font to handle the subscript case. 637 // Text must overlap with the last rendered baseline text by at least 638 // a small amount in order to be considered as being on the same line. 639 640 /* XXX BC: In theory, this check should really check if the next char is in full range 641 * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot 642 * of regression test failures. So, I'm leaving it be for now. */ 643 if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) 644 { 645 // If we have RTL text on the page, change the direction 646 if (hasRtl) 647 { 648 lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant); 649 } 650 651 /* normalize string to remove presentation forms. 
652 * Note that this must come after the line direction 653 * conversion because the process looks ahead to the next 654 * logical character. 655 */ 656 lineStr = normalize.normalizePres(lineStr); 657 658 writeString(lineStr); 659 lineStr = ""; 660 661 writeLineSeparator( ); 662 663 endOfLastTextX = -1; 664 expectedStartOfNextWordX = -1; 665 maxYForLine = -1; 666 maxHeightForLine = -1; 667 minYTopForLine = Float.MAX_VALUE; 668 } 669 670 //Test if our TextPosition starts after a new word would be expected to start. 671 if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX && 672 //only bother adding a space if the last character was not a space 673 lastPosition.getCharacter() != null && 674 !lastPosition.getCharacter().endsWith( " " ) ) 675 { 676 lineStr += getWordSeparator(); 677 } 678 } 679 680 if (positionY >= maxYForLine) 681 { 682 maxYForLine = positionY; 683 } 684 685 // RDD - endX is what PDF considers to be the x coordinate of the 686 // end position of the text. We use it in computing our metrics below. 
687 endOfLastTextX = positionX + positionWidth; 688 689 // add it to the list 690 if (characterValue != null) 691 { 692 lineStr += characterValue; 693 } 694 maxHeightForLine = Math.max( maxHeightForLine, positionHeight ); 695 minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); 696 lastPosition = position; 697 lastWordSpacing = wordSpacing; 698 previousAveCharWidth = averageCharWidth; 699 } 700 701 // print the final line 702 if (lineStr.length() > 0) 703 { 704 if (hasRtl) 705 { 706 lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant); 707 } 708 709 // normalize string to remove presentation forms 710 lineStr = normalize.normalizePres(lineStr); 711 712 writeString(lineStr); 713 } 714 715 endArticle(); 716 } 717 writePageSeperator(); 718 } 719 720 private boolean overlap( float y1, float height1, float y2, float height2 ) 721 { 722 return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) || 723 (y1 <= y2 && y1 >= y2-height2); 724 } 725 726 /** 727 * Write the page separator value to the output stream. 728 * @throws IOException 729 * If there is a problem writing out the pageseparator to the document. 730 */ 731 protected void writePageSeperator() throws IOException 732 { 733 // RDD - newline at end of flush - required for end of page (so that the top 734 // of the next page starts on its own line. 735 // 736 output.write(getPageSeparator()); 737 output.flush(); 738 } 739 740 /** 741 * Write the line separator value to the output stream. 742 * @throws IOException 743 * If there is a problem writing out the lineseparator to the document. 744 */ 745 protected void writeLineSeparator( ) throws IOException 746 { 747 output.write(getLineSeparator()); 748 } 749 750 751 /** 752 * Write the word separator value to the output stream. 753 * @throws IOException 754 * If there is a problem writing out the wordseparator to the document. 
755 */ 756 protected void writeWordSeparator() throws IOException 757 { 758 output.write(getWordSeparator()); 759 } 760 761 /** 762 * Write the string in TextPosition to the output stream. 763 * 764 * @param text The text to write to the stream. 765 * @throws IOException If there is an error when writing the text. 766 */ 767 protected void writeCharacters( TextPosition text ) throws IOException 768 { 769 output.write( text.getCharacter() ); 770 } 771 772 /** 773 * Write a Java string to the output stream. 774 * 775 * @param text The text to write to the stream. 776 * @throws IOException If there is an error when writing the text. 777 */ 778 protected void writeString( String text ) throws IOException 779 { 780 output.write( text ); 781 } 782 783 /** 784 * This will determine of two floating point numbers are within a specified variance. 785 * 786 * @param first The first number to compare to. 787 * @param second The second number to compare to. 788 * @param variance The allowed variance. 789 */ 790 private boolean within( float first, float second, float variance ) 791 { 792 return second < first + variance && second > first - variance; 793 } 794 795 796 /** 797 * This will process a TextPosition object and add the 798 * text to the list of characters on a page. It takes care of 799 * overlapping text. 800 * 801 * @param text The text to process. 802 */ 803 protected void processTextPosition( TextPosition text ) 804 { 805 boolean showCharacter = true; 806 if( suppressDuplicateOverlappingText ) 807 { 808 showCharacter = false; 809 String textCharacter = text.getCharacter(); 810 float textX = text.getX(); 811 float textY = text.getY(); 812 List sameTextCharacters = (List)characterListMapping.get( textCharacter ); 813 if( sameTextCharacters == null ) 814 { 815 sameTextCharacters = new ArrayList(); 816 characterListMapping.put( textCharacter, sameTextCharacters ); 817 } 818 819 // RDD - Here we compute the value that represents the end of the rendered 820 // text. 
This value is used to determine whether subsequent text rendered 821 // on the same line overwrites the current text. 822 // 823 // We subtract any positive padding to handle cases where extreme amounts 824 // of padding are applied, then backed off (not sure why this is done, but there 825 // are cases where the padding is on the order of 10x the character width, and 826 // the TJ just backs up to compensate after each character). Also, we subtract 827 // an amount to allow for kerning (a percentage of the width of the last 828 // character). 829 // 830 boolean suppressCharacter = false; 831 float tolerance = (text.getWidth()/textCharacter.length())/3.0f; 832 for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ ) 833 { 834 TextPosition character = (TextPosition)sameTextCharacters.get( i ); 835 String charCharacter = character.getCharacter(); 836 float charX = character.getX(); 837 float charY = character.getY(); 838 //only want to suppress 839 840 if( charCharacter != null && 841 //charCharacter.equals( textCharacter ) && 842 within( charX, textX, tolerance ) && 843 within( charY, 844 textY, 845 tolerance ) ) 846 { 847 suppressCharacter = true; 848 } 849 } 850 if( !suppressCharacter ) 851 { 852 sameTextCharacters.add( text ); 853 showCharacter = true; 854 } 855 } 856 857 if( showCharacter ) 858 { 859 //if we are showing the character then we need to determine which 860 //article it belongs to. 
861 int foundArticleDivisionIndex = -1; 862 int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; 863 int notFoundButFirstLeftArticleDivisionIndex = -1; 864 int notFoundButFirstAboveArticleDivisionIndex = -1; 865 float x = text.getX(); 866 float y = text.getY(); 867 if( shouldSeparateByBeads ) 868 { 869 for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ ) 870 { 871 PDThreadBead bead = (PDThreadBead)pageArticles.get( i ); 872 if( bead != null ) 873 { 874 PDRectangle rect = bead.getRectangle(); 875 if( rect.contains( x, y ) ) 876 { 877 foundArticleDivisionIndex = i*2+1; 878 } 879 else if( (x < rect.getLowerLeftX() || 880 y < rect.getUpperRightY()) && 881 notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) 882 { 883 notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2; 884 } 885 else if( x < rect.getLowerLeftX() && 886 notFoundButFirstLeftArticleDivisionIndex == -1) 887 { 888 notFoundButFirstLeftArticleDivisionIndex = i*2; 889 } 890 else if( y < rect.getUpperRightY() && 891 notFoundButFirstAboveArticleDivisionIndex == -1) 892 { 893 notFoundButFirstAboveArticleDivisionIndex = i*2; 894 } 895 } 896 else 897 { 898 foundArticleDivisionIndex = 0; 899 } 900 } 901 } 902 else 903 { 904 foundArticleDivisionIndex = 0; 905 } 906 int articleDivisionIndex = -1; 907 if( foundArticleDivisionIndex != -1 ) 908 { 909 articleDivisionIndex = foundArticleDivisionIndex; 910 } 911 else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 ) 912 { 913 articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; 914 } 915 else if( notFoundButFirstLeftArticleDivisionIndex != -1 ) 916 { 917 articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; 918 } 919 else if( notFoundButFirstAboveArticleDivisionIndex != -1 ) 920 { 921 articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; 922 } 923 else 924 { 925 articleDivisionIndex = charactersByArticle.size()-1; 926 } 927 928 List textList = (List) charactersByArticle.get( 
articleDivisionIndex ); 929 930 /* In the wild, some PDF encoded documents put diacritics (accents on 931 * top of characters) into a separate Tj element. When displaying them 932 * graphically, the two chunks get overlayed. With text output though, 933 * we need to do the overlay. This code recombines the diacritic with 934 * its associated character if the two are consecutive. 935 */ 936 if(textList.isEmpty()) 937 { 938 textList.add(text); 939 } 940 else 941 { 942 /* test if we overlap the previous entry. 943 * Note that we are making an assumption that we need to only look back 944 * one TextPosition to find what we are overlapping. 945 * This may not always be true. */ 946 TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1); 947 if(text.isDiacritic() && previousTextPosition.contains(text)) 948 { 949 previousTextPosition.mergeDiacritic(text, normalize); 950 } 951 /* If the previous TextPosition was the diacritic, merge it into this 952 * one and remove it from the list. */ 953 else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) 954 { 955 text.mergeDiacritic(previousTextPosition, normalize); 956 textList.remove(textList.size()-1); 957 textList.add(text); 958 } 959 else 960 { 961 textList.add(text); 962 } 963 } 964 } 965 } 966 967 /** 968 * This is the page that the text extraction will start on. The pages start 969 * at page 1. For example in a 5 page PDF document, if the start page is 1 970 * then all pages will be extracted. If the start page is 4 then pages 4 and 5 971 * will be extracted. The default value is 1. 972 * 973 * @return Value of property startPage. 974 */ 975 public int getStartPage() 976 { 977 return startPage; 978 } 979 980 /** 981 * This will set the first page to be extracted by this class. 982 * 983 * @param startPageValue New value of property startPage. 
984 */ 985 public void setStartPage(int startPageValue) 986 { 987 startPage = startPageValue; 988 } 989 990 /** 991 * This will get the last page that will be extracted. This is inclusive, 992 * for example if a 5 page PDF an endPage value of 5 would extract the 993 * entire document, an end page of 2 would extract pages 1 and 2. This defaults 994 * to Integer.MAX_VALUE such that all pages of the pdf will be extracted. 995 * 996 * @return Value of property endPage. 997 */ 998 public int getEndPage() 999 { 1000 return endPage; 1001 } 1002 1003 /** 1004 * This will set the last page to be extracted by this class. 1005 * 1006 * @param endPageValue New value of property endPage. 1007 */ 1008 public void setEndPage(int endPageValue) 1009 { 1010 endPage = endPageValue; 1011 } 1012 1013 /** 1014 * Set the desired line separator for output text. The line.separator 1015 * system property is used if the line separator preference is not set 1016 * explicitly using this method. 1017 * 1018 * @param separator The desired line separator string. 1019 */ 1020 public void setLineSeparator(String separator) 1021 { 1022 lineSeparator = separator; 1023 } 1024 1025 /** 1026 * This will get the line separator. 1027 * 1028 * @return The desired line separator string. 1029 */ 1030 public String getLineSeparator() 1031 { 1032 return lineSeparator; 1033 } 1034 1035 /** 1036 * Set the desired page separator for output text. The line.separator 1037 * system property is used if the page separator preference is not set 1038 * explicitly using this method. 1039 * 1040 * @param separator The desired page separator string. 1041 */ 1042 public void setPageSeparator(String separator) 1043 { 1044 pageSeparator = separator; 1045 } 1046 1047 /** 1048 * This will get the word separator. 1049 * 1050 * @return The desired word separator string. 1051 */ 1052 public String getWordSeparator() 1053 { 1054 return wordSeparator; 1055 } 1056 1057 /** 1058 * Set the desired word separator for output text. 
The PDFBox text extraction 1059 * algorithm will output a space character if there is enough space between 1060 * two words. By default a space character is used. If you need and accurate 1061 * count of characters that are found in a PDF document then you might want to 1062 * set the word separator to the empty string. 1063 * 1064 * @param separator The desired page separator string. 1065 */ 1066 public void setWordSeparator(String separator) 1067 { 1068 wordSeparator = separator; 1069 } 1070 1071 /** 1072 * This will get the page separator. 1073 * 1074 * @return The page separator string. 1075 */ 1076 public String getPageSeparator() 1077 { 1078 return pageSeparator; 1079 } 1080 /** 1081 * @return Returns the suppressDuplicateOverlappingText. 1082 */ 1083 public boolean shouldSuppressDuplicateOverlappingText() 1084 { 1085 return suppressDuplicateOverlappingText; 1086 } 1087 1088 /** 1089 * Get the current page number that is being processed. 1090 * 1091 * @return A 1 based number representing the current page. 1092 */ 1093 protected int getCurrentPageNo() 1094 { 1095 return currentPageNo; 1096 } 1097 1098 /** 1099 * The output stream that is being written to. 1100 * 1101 * @return The stream that output is being written to. 1102 */ 1103 protected Writer getOutput() 1104 { 1105 return output; 1106 } 1107 1108 /** 1109 * Character strings are grouped by articles. It is quite common that there 1110 * will only be a single article. This returns a List that contains List objects, 1111 * the inner lists will contain TextPosition objects. 1112 * 1113 * @return A double List of TextPositions for all text strings on the page. 1114 */ 1115 protected List getCharactersByArticle() 1116 { 1117 return charactersByArticle; 1118 } 1119 1120 /** 1121 * By default the text stripper will attempt to remove text that overlapps each other. 1122 * Word paints the same character several times in order to make it look bold. 
By setting 1123 * this to false all text will be extracted, which means that certain sections will be 1124 * duplicated, but better performance will be noticed. 1125 * 1126 * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set. 1127 */ 1128 public void setSuppressDuplicateOverlappingText( 1129 boolean suppressDuplicateOverlappingTextValue) 1130 { 1131 this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; 1132 } 1133 1134 /** 1135 * This will tell if the text stripper should separate by beads. 1136 * 1137 * @return If the text will be grouped by beads. 1138 */ 1139 public boolean shouldSeparateByBeads() 1140 { 1141 return shouldSeparateByBeads; 1142 } 1143 1144 /** 1145 * Set if the text stripper should group the text output by a list of beads. The default value is true! 1146 * 1147 * @param aShouldSeparateByBeads The new grouping of beads. 1148 */ 1149 public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) 1150 { 1151 this.shouldSeparateByBeads = aShouldSeparateByBeads; 1152 } 1153 1154 /** 1155 * Get the bookmark where text extraction should end, inclusive. Default is null. 1156 * 1157 * @return The ending bookmark. 1158 */ 1159 public PDOutlineItem getEndBookmark() 1160 { 1161 return endBookmark; 1162 } 1163 1164 /** 1165 * Set the bookmark where the text extraction should stop. 1166 * 1167 * @param aEndBookmark The ending bookmark. 1168 */ 1169 public void setEndBookmark(PDOutlineItem aEndBookmark) 1170 { 1171 endBookmark = aEndBookmark; 1172 } 1173 1174 /** 1175 * Get the bookmark where text extraction should start, inclusive. Default is null. 1176 * 1177 * @return The starting bookmark. 1178 */ 1179 public PDOutlineItem getStartBookmark() 1180 { 1181 return startBookmark; 1182 } 1183 1184 /** 1185 * Set the bookmark where text extraction should start, inclusive. 1186 * 1187 * @param aStartBookmark The starting bookmark. 
1188 */ 1189 public void setStartBookmark(PDOutlineItem aStartBookmark) 1190 { 1191 startBookmark = aStartBookmark; 1192 } 1193 1194 /** 1195 * This will tell if the text stripper should sort the text tokens 1196 * before writing to the stream. 1197 * 1198 * @return true If the text tokens will be sorted before being written. 1199 */ 1200 public boolean shouldSortByPosition() 1201 { 1202 return sortByPosition; 1203 } 1204 1205 /** 1206 * The order of the text tokens in a PDF file may not be in the same 1207 * as they appear visually on the screen. For example, a PDF writer may 1208 * write out all text by font, so all bold or larger text, then make a second 1209 * pass and write out the normal text.<br/> 1210 * The default is to <b>not</b> sort by position.<br/> 1211 * <br/> 1212 * A PDF writer could choose to write each character in a different order. By 1213 * default PDFBox does <b>not</b> sort the text tokens before processing them due to 1214 * performance reasons. 1215 * 1216 * @param newSortByPosition Tell PDFBox to sort the text positions. 1217 */ 1218 public void setSortByPosition(boolean newSortByPosition) 1219 { 1220 sortByPosition = newSortByPosition; 1221 } 1222 1223 /** 1224 * Get the current space width-based tolerance value that is being used 1225 * to estimate where spaces in text should be added. Note that the 1226 * default value for this has been determined from trial and error. 1227 * 1228 * @return The current tolerance / scaling factor 1229 */ 1230 public float getSpacingTolerance() 1231 { 1232 return spacingTolerance; 1233 } 1234 1235 /** 1236 * Set the space width-based tolerance value that is used 1237 * to estimate where spaces in text should be added. Note that the 1238 * default value for this has been determined from trial and error. 1239 * Setting this value larger will reduce the number of spaces added. 
*
     * @param spacingToleranceValue tolerance / scaling factor to use
     */
    public void setSpacingTolerance(float spacingToleranceValue)
    {
        spacingTolerance = spacingToleranceValue;
    }

    /**
     * Returns the character width-based tolerance value currently used to
     * estimate where spaces in text should be added. Note that the default
     * value for this has been determined from trial and error.
     *
     * @return The current tolerance / scaling factor
     */
    public float getAverageCharTolerance()
    {
        return this.averageCharTolerance;
    }

    /**
     * Sets the character width-based tolerance value used to estimate
     * where spaces in text should be added. Note that the default value
     * for this has been determined from trial and error. Setting this
     * value larger will reduce the number of spaces added.
     *
     * @param averageCharToleranceValue average tolerance / scaling factor to use
     */
    public void setAverageCharTolerance(float averageCharToleranceValue)
    {
        averageCharTolerance = averageCharToleranceValue;
    }
}