/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.exceptions.WrappedIOException;
import org.apache.pdfbox.io.RandomAccess;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.pdmodel.fdf.FDFDocument;

import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * This class will handle the parsing of the PDF document.
 *
 * The parser reads the input sequentially: header first, then every object,
 * xref table, trailer and startxref section it encounters, until the end of
 * the stream is reached.
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.53 $
 */
public class PDFParser extends BaseParser
{

    /**
     * Log instance.
     */
    private static final Log log = LogFactory.getLog(PDFParser.class);

    /**
     * Byte value of the blank character, pushed back as a token separator.
     */
    private static final int SPACE_BYTE = 32;

    private static final String PDF_HEADER = "%PDF-";
    private static final String FDF_HEADER = "%FDF-";

    /**
     * Number of characters of the "d.d" version that follows the header marker.
     */
    private static final int VERSION_LENGTH = 3;

    /**
     * Charset used when text that was read from the source is pushed back into it.
     * ISO-8859-1 maps every char below 256 to exactly one byte, so the byte count
     * never depends on the platform default charset.
     * NOTE(review): assumes readString()/readLine() in BaseParser build their result
     * one char per byte read - confirm against BaseParser.
     */
    private static final String PUSHBACK_CHARSET = "ISO-8859-1";

    /**
     * Matches a buffer that starts with an object header, i.e. "number generation obj".
     * Compiled once because skipToNextObj() may be called repeatedly.
     */
    private static final Pattern OBJECT_START = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);

    /**
     * When true, an object that fails to parse is skipped instead of aborting the parse.
     */
    private boolean forceParsing = false;

    /**
     * A list of duplicate objects found when Parsing the PDF
     * File. Holds ConflictObj instances.
     */
    private List conflictList = new ArrayList();

    /**
     * Temp file directory.
     */
    private File tempDirectory = null;

    /**
     * Optional random access storage handed to the COSDocument; may be null.
     */
    private RandomAccess raf = null;

    /**
     * Constructor.
     *
     * @param input The input stream that contains the PDF document.
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFParser( InputStream input ) throws IOException
    {
        this(input, null);
    }

    /**
     * Constructor to allow control over RandomAccessFile.
     * @param input The input stream that contains the PDF document.
     * @param rafi The RandomAccessFile to be used in internal COSDocument
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFParser(InputStream input, RandomAccess rafi)
        throws IOException
    {
        this(input, rafi, false);
    }

    /**
     * Constructor to allow control over RandomAccessFile.
     * Also enables parser to skip corrupt objects to try and force parsing
     * @param input The input stream that contains the PDF document.
     * @param rafi The RandomAccessFile to be used in internal COSDocument
     * @param force When true, the parser will skip corrupt pdf objects and
     * will continue parsing at the next object in the file
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFParser(InputStream input, RandomAccess rafi, boolean force)
        throws IOException
    {
        super(input);
        this.raf = rafi;
        this.forceParsing = force;
    }

    /**
     * This is the directory where pdfbox will create a temporary file
     * for storing pdf document stream in.  By default this directory will
     * be the value of the system property java.io.tmpdir.
     *
     * @param tmpDir The directory to create scratch files needed to store
     *        pdf document streams.
     */
    public void setTempDirectory( File tmpDir )
    {
        tempDirectory = tmpDir;
    }

    /**
     * This will parse the stream and populate the COSDocument object.  This will close
     * the stream when it is done parsing.
     *
     * If anything goes wrong the partially built document is closed before the
     * error is propagated; non-IO failures are wrapped in a WrappedIOException.
     *
     * @throws IOException If there is an error reading from the stream or corrupt data
     * is found.
     */
    public void parse() throws IOException
    {
        try
        {
            if( raf != null )
            {
                document = new COSDocument( raf );
            }
            else if( tempDirectory != null )
            {
                document = new COSDocument( tempDirectory );
            }
            else
            {
                document = new COSDocument();
            }
            setDocument( document );

            parseHeader();

            //Some PDF files have garbage between the header and the
            //first object
            skipToNextObj();

            boolean wasLastParsedObjectEOF = false;
            try
            {
                while( !pdfSource.isEOF() )
                {
                    try
                    {
                        wasLastParsedObjectEOF = parseObject();
                    }
                    catch(IOException e)
                    {
                        if( !forceParsing )
                        {
                            throw e;
                        }
                        // Forced parsing: report the broken object and resume
                        // at the next recognizable keyword.
                        log.warn("Parsing Error, Skipping Object", e);
                        skipToNextObj();
                    }
                    skipSpaces();
                }
                //Test if we saw a trailer section. If not, look for an XRef Stream (Cross-Reference Stream)
                //to populate the trailer and xref information. For PDF 1.5 and above
                if( document.getTrailer() == null )
                {
                    document.parseXrefStreams();
                }
                if( !document.isEncrypted() )
                {
                    document.dereferenceObjectStreams();
                }
                ConflictObj.resolveConflicts(document, conflictList);
            }
            catch( IOException e )
            {
                /*
                 * PDF files may have random data after the EOF marker. Ignore errors if
                 * last object processed is EOF.
                 */
                if( !wasLastParsedObjectEOF )
                {
                    throw e;
                }
            }
        }
        catch( Throwable t )
        {
            //so if the PDF is corrupt then close the document and clear
            //all resources to it
            if( document != null )
            {
                document.close();
            }
            if( t instanceof IOException )
            {
                throw (IOException)t;
            }
            else
            {
                throw new WrappedIOException( t );
            }
        }
        finally
        {
            pdfSource.close();
        }
    }

    /**
     * Skip to the start of the next object.  This is used to recover
     * from a corrupt object. This should handle all cases that parseObject
     * supports. This assumes that the next object will
     * start on its own line.
     *
     * The source is advanced one byte at a time until the bytes at the current
     * position start with "trailer", "xref", "startxref", "stream" or an
     * object header; the source is left positioned on that keyword.
     *
     * @throws IOException If reading from or pushing back into the source fails.
     */
    private void skipToNextObj() throws IOException
    {
        byte[] b = new byte[16];
        /* Read a buffer of data each time to see if it starts with a
         * known keyword. This is not the most efficient design, but we should
         * rarely be needing this function. We could update this to use the
         * circular buffer, like in readUntilEndStream().
         */
        while(!pdfSource.isEOF())
        {
            int l = pdfSource.read(b);
            if(l < 1)
            {
                break;
            }
            // Only the first l bytes were filled by this read; the remainder of the
            // buffer still holds data from an earlier iteration and must neither be
            // inspected nor pushed back.
            String s = new String(b, 0, l, "US-ASCII");
            if(s.startsWith("trailer") ||
                    s.startsWith("xref") ||
                    s.startsWith("startxref") ||
                    s.startsWith("stream") ||
                    OBJECT_START.matcher(s).matches())
            {
                pdfSource.unread(b, 0, l);
                break;
            }
            else
            {
                // drop the first byte and retry from the following position
                pdfSource.unread(b, 1, l-1);
            }
        }
    }

    /**
     * Tells whether a line contains the PDF or the FDF header marker.
     *
     * @param line The line to inspect.
     * @return true if "%PDF-" or "%FDF-" occurs anywhere in the line.
     */
    private static boolean containsHeaderMarker( String line )
    {
        return line.indexOf( PDF_HEADER ) != -1 || line.indexOf( FDF_HEADER ) != -1;
    }

    /**
     * Encodes text that is about to be pushed back into the source.
     *
     * @param text The text to encode.
     * @return The bytes of the text, one byte per char.
     * @throws IOException If the charset is not available.
     */
    private static byte[] toPushbackBytes( String text ) throws IOException
    {
        return text.getBytes( PUSHBACK_CHARSET );
    }

    /**
     * Reads the "%PDF-d.d" / "%FDF-d.d" header, stores the header string and
     * the version in the document, and pushes back anything that follows the
     * version on the same line.
     *
     * @throws IOException If no header marker or no usable version is found.
     */
    private void parseHeader() throws IOException
    {
        // read first line
        String header = readLine();
        // some pdf-documents are broken and the pdf-version is in one of the following lines
        if( !containsHeaderMarker( header ) )
        {
            header = readLine();
            while( !containsHeaderMarker( header ) )
            {
                // if a line starts with a digit, it has to be the first one with data in it
                if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
                {
                    break;
                }
                header = readLine();
            }
        }

        // nothing found
        if( !containsHeaderMarker( header ) )
        {
            throw new IOException( "Error: Header doesn't contain versioninfo" );
        }

        //sometimes there are some garbage bytes in the header before the header
        //actually starts, so lets try to find the header first.
        int headerStart = header.indexOf( PDF_HEADER );
        if (headerStart == -1)
        {
            headerStart = header.indexOf(FDF_HEADER);
        }

        //greater than zero because if it is zero then
        //there is no point of trimming
        if ( headerStart > 0 )
        {
            //trim off any leading characters
            header = header.substring( headerStart, header.length() );
        }

        // After trimming, the header starts with one of the two markers.
        String marker = header.startsWith( PDF_HEADER ) ? PDF_HEADER : FDF_HEADER;
        int versionEnd = marker.length() + VERSION_LENGTH;

        /*
         * This is used if there is garbage after the header on the same line
         */
        if( !header.matches( marker + "\\d.\\d" ) )
        {
            if( header.length() < versionEnd )
            {
                // e.g. "%PDF-1": there is no room for a "d.d" version, and cutting the
                // line at versionEnd would run past its end.
                throw new IOException( "Error: Header doesn't contain versioninfo" );
            }
            String headerGarbage = header.substring( versionEnd, header.length() ) + "\n";
            header = header.substring( 0, versionEnd );
            pdfSource.unread( toPushbackBytes( headerGarbage ) );
        }
        document.setHeaderString(header);

        try
        {
            float pdfVersion = Float.parseFloat(
                    header.substring( marker.length(), Math.min( header.length(), versionEnd ) ) );
            document.setVersion( pdfVersion );
        }
        catch ( NumberFormatException e )
        {
            IOException failure = new IOException( "Error getting pdf version:" + e );
            failure.initCause( e );
            throw failure;
        }
    }

    /**
     * This will get the document that was parsed.  parse() must be called before this is called.
     * When you are done with this document you must call close() on it to release
     * resources.
     *
     * @return The document that was parsed.
     *
     * @throws IOException If there is an error getting the document.
     */
    public COSDocument getDocument() throws IOException
    {
        if( document == null )
        {
            throw new IOException( "You must call parse() before calling getDocument()" );
        }
        return document;
    }

    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    public PDDocument getPDDocument() throws IOException
    {
        return new PDDocument( getDocument() );
    }

    /**
     * This will get the FDF document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    public FDFDocument getFDFDocument() throws IOException
    {
        return new FDFDocument( getDocument() );
    }

    /**
     * This will parse the next object from the stream and add it to
     * the local state.
     *
     * @return Returns true if the processed object had an endOfFile marker
     *
     * @throws IOException If an IO error occurs.
     */
    private boolean parseObject() throws IOException
    {
        int currentObjByteOffset = pdfSource.getOffset();
        boolean isEndOfFile = false;
        skipSpaces();
        //peek at the next character to determine the type of object we are parsing
        char peekedChar = (char)pdfSource.peek();

        //ignore endobj and endstream sections.
        while( peekedChar == 'e' )
        {
            //there are times when there are multiple endobj, so lets
            //just read them and move on.
            readString();
            skipSpaces();
            peekedChar = (char)pdfSource.peek();
        }
        if( pdfSource.isEOF())
        {
            //"Skipping because of EOF" );
            //end of file we will return a false and call it a day.
        }
        //xref table. Note: The contents of the Xref table are currently ignored
        else if( peekedChar == 'x')
        {
            parseXrefTable();
        }
        // Note: startxref can occur in either a trailer section or by itself
        else if (peekedChar == 't' || peekedChar == 's')
        {
            if(peekedChar == 't')
            {
                parseTrailer();
                peekedChar = (char)pdfSource.peek();
            }
            if (peekedChar == 's')
            {
                parseStartXref();
                //verify that EOF exists
                String eof = readExpectedString( "%%EOF" );
                if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
                {
                    throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
                            " next=" +readString() );
                }
                isEndOfFile = true;
            }
        }
        //we are going to parse an normal object
        else
        {
            // -1/-1 is the key used for an object that has no "n g obj" prefix
            int number = -1;
            int genNum = -1;
            boolean missingObjectNumber = false;
            try
            {
                char peeked = (char)pdfSource.peek();
                if( peeked == '<' )
                {
                    missingObjectNumber = true;
                }
                else
                {
                    number = readInt();
                }
            }
            catch( IOException e )
            {
                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
                //statements after an object, of course this is nonsense
                //but because we want to support as many PDFs as possible
                //we will simply try again
                number = readInt();
            }
            if( !missingObjectNumber )
            {
                skipSpaces();
                genNum = readInt();

                String objectKey = readString( 3 );
                if( !objectKey.equals( "obj" ) )
                {
                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
                }
            }

            skipSpaces();
            COSBase pb = parseDirObject();
            String endObjectKey = readString();

            if( endObjectKey.equals( "stream" ) )
            {
                // put the keyword back so that parseCOSStream() can read it itself
                pdfSource.unread( toPushbackBytes( endObjectKey ) );
                pdfSource.unread( ' ' );
                if( pb instanceof COSDictionary )
                {
                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
                }
                else
                {
                    // this is not legal
                    // the combination of a dict and the stream/endstream forms a complete stream object
                    throw new IOException("stream not preceded by dictionary");
                }
                endObjectKey = readString();
            }

            COSObjectKey key = new COSObjectKey( number, genNum );
            COSObject pdfObject = document.getObjectFromPool( key );
            if(pdfObject.getObject() == null)
            {
                pdfObject.setObject(pb);
            }
            /*
             * If the object we returned already has a baseobject, then we have a conflict
             * which we will resolve using information after we parse the xref table.
             */
            else
            {
                addObjectToConflicts(currentObjByteOffset, key, pb);
            }

            if( !endObjectKey.equals( "endobj" ) )
            {
                recoverFromMissingEndObj( endObjectKey );
            }
            skipSpaces();
        }
        return isEndOfFile;
    }

    /**
     * Handles the token found where "endobj" was expected but not read.
     *
     * @param endObjectKey The token that was read instead of "endobj".
     *
     * @throws IOException If no "endobj" can be found within the next tokens.
     */
    private void recoverFromMissingEndObj( String endObjectKey ) throws IOException
    {
        if (endObjectKey.startsWith( "endobj" ) )
        {
            /*
             * Some PDF files don't contain a new line after endobj so we
             * need to make sure that the next object number is getting read separately
             * and not part of the endobj keyword.  Ex. Some files would have "endobj28"
             * instead of "endobj"
             */
            pdfSource.unread( toPushbackBytes( endObjectKey.substring( 6 ) ) );
        }
        else if( !pdfSource.isEOF() )
        {
            try
            {
                //It is possible that the endobj is missing, there
                //are several PDFs out there that do that so skip it and move on.
                Float.parseFloat( endObjectKey );
                // the token is a number: push it back so that it is read again
                pdfSource.unread( SPACE_BYTE );
                pdfSource.unread( toPushbackBytes( endObjectKey ) );
            }
            catch( NumberFormatException e )
            {
                //we will try again incase there was some garbage which
                //some writers will leave behind.
                String secondEndObjectKey = readString();
                if( !secondEndObjectKey.equals( "endobj" ) )
                {
                    if( isClosing() )
                    {
                        //found a case with 17506.pdf object 41 that was like this
                        //41 0 obj [/Pattern /DeviceGray] ] endobj
                        //notice the second array close, here we are reading it
                        //and ignoring and attempting to continue
                        pdfSource.read();
                    }
                    skipSpaces();
                    String thirdPossibleEndObj = readString();
                    if( !thirdPossibleEndObj.equals( "endobj" ) )
                    {
                        throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
                                "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                    }
                }
            }
        }
    }

    /**
     * Adds a new ConflictObj to the conflictList.
     * @param offset the offset of the ConflictObj
     * @param key The COSObjectKey of this object
     * @param pb The COSBase of this conflictObj
     * @throws IOException If the object cannot be set up.
     */
    private void addObjectToConflicts(int offset, COSObjectKey key, COSBase pb) throws IOException
    {
        COSObject obj = new COSObject(null);
        obj.setObjectNumber( COSInteger.get( key.getNumber() ) );
        obj.setGenerationNumber( COSInteger.get( key.getGeneration() ) );
        obj.setObject(pb);
        ConflictObj conflictObj = new ConflictObj(offset, key, obj);
        conflictList.add(conflictObj);
    }

    /**
     * This will parse the startxref section from the stream.
     * The startxref value is ignored.
     *
     * @return false on parsing error
     * @throws IOException If an IO error occurs.
     */
    private boolean parseStartXref() throws IOException
    {
        if(pdfSource.peek() != 's')
        {
            return false;
        }
        String startXRef = readString();
        if( !startXRef.trim().equals( "startxref" ) )
        {
            return false;
        }
        skipSpaces();
        /* This integer is the byte offset of the first object referenced by the xref or xref stream
         * Not needed for PDFbox
         */
        readInt();
        return true;
    }


    /**
     * This will parse the xref table from the stream and add it to the state.
     * Only the entries marked as in use ("n") are recorded in the document.
     *
     * @return false on parsing error
     * @throws IOException If an IO error occurs.
     */
    private boolean parseXrefTable() throws IOException
    {
        if(pdfSource.peek() != 'x')
        {
            return false;
        }
        String xref = readString();
        if( !xref.trim().equals( "xref" ) )
        {
            return false;
        }
        /*
         * Xref tables can have multiple sections.
         * Each starts with a starting object id and a count.
         */
        while(true)
        {
            int currObjID = readInt(); // first obj id
            int count = readInt(); // the number of objects in the xref table
            skipSpaces();
            for(int i = 0; i < count; i++)
            {
                if(pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()))
                {
                    break;
                }
                if(pdfSource.peek() == 't')
                {
                    break;
                }
                String currentLine = readLine();
                String[] splitString = currentLine.split(" ");
                if (splitString.length < 3)
                {
                    log.warn("invalid xref line: " + currentLine);
                    break;
                }
                /* This supports the corrupt table as reported in
                 * PDFBOX-474 (XXXX XXX XX n) */
                if(splitString[splitString.length-1].equals("n"))
                {
                    try
                    {
                        int currOffset = Integer.parseInt(splitString[0]);
                        int currGenID = Integer.parseInt(splitString[1]);
                        COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
                        document.setXRef(objKey, currOffset);
                    }
                    catch(NumberFormatException e)
                    {
                        IOException failure = new IOException(e.getMessage());
                        failure.initCause(e);
                        throw failure;
                    }
                }
                else if(!splitString[2].equals("f"))
                {
                    throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
                }
                currObjID++;
                skipSpaces();
            }
            skipSpaces();
            // another section follows only if the next character is a digit
            char c = (char)pdfSource.peek();
            if(c < '0' || c > '9')
            {
                break;
            }
        }
        return true;
    }

    /**
     * This will parse the trailer from the stream and add it to the state.
     * The entries are merged into the document trailer if one was already set.
     *
     * @return false on parsing error
     * @throws IOException If an IO error occurs.
     */
    private boolean parseTrailer() throws IOException
    {
        if(pdfSource.peek() != 't')
        {
            return false;
        }
        //read "trailer"
        String nextLine = readLine();
        if( !nextLine.trim().equals( "trailer" ) )
        {
            // in some cases the EOL is missing and the trailer immediately
            // continues with "<<" or with a blank character
            // even if this does not comply with PDF reference we want to support as many PDFs as possible
            // Acrobat reader can also deal with this.
            if (nextLine.startsWith("trailer"))
            {
                // push back the end of line and everything after the keyword
                byte[] b = toPushbackBytes( nextLine );
                int len = "trailer".length();
                pdfSource.unread('\n');
                pdfSource.unread(b, len, b.length-len);
            }
            else
            {
                return false;
            }
        }

        // in some cases the EOL is missing and the trailer continues with " <<"
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        skipSpaces();

        COSDictionary parsedTrailer = parseCOSDictionary();
        COSDictionary docTrailer = document.getTrailer();
        if( docTrailer == null )
        {
            document.setTrailer( parsedTrailer );
        }
        else
        {
            docTrailer.addAll( parsedTrailer );
        }
        skipSpaces();
        return true;
    }

    /**
     * Used to resolve conflicts when a PDF Document has multiple objects with
     * the same id number. Ideally, we could use the Xref table when parsing
     * the document to be able to determine which of the objects with the same ID
     * is correct, but we do not have access to the Xref Table during parsing.
     * Instead, we queue up the conflicts and resolve them after the Xref has
     * been parsed.  The Objects listed in the Xref Table are kept and the
     * others are ignored.
     */
    private static class ConflictObj
    {

        private final int offset;
        private final COSObjectKey objectKey;
        private final COSObject object;

        /**
         * Constructor.
         *
         * @param offsetValue The byte offset at which the object was found.
         * @param key The key shared with the object already in the pool.
         * @param pdfObject The object that was parsed at that offset.
         */
        public ConflictObj(int offsetValue, COSObjectKey key, COSObject pdfObject)
        {
            this.offset = offsetValue;
            this.objectKey = key;
            this.object = pdfObject;
        }

        /**
         * {@inheritDoc}
         */
        public String toString()
        {
            return "Object(" + offset + ", " + objectKey + ")";
        }

        /**
         * Sometimes pdf files have objects with the same ID number yet are
         * not referenced by the Xref table and therefore should be excluded.
         * This method goes through the conflicts list and replaces the object stored
         * in the objects array with this one if it is referenced by the xref
         * table.
         *
         * @param document The document whose object pool is updated.
         * @param conflictList The ConflictObj instances collected while parsing.
         * @throws IOException If an object cannot be fetched from the pool.
         */
        private static void resolveConflicts(COSDocument document, List conflictList) throws IOException
        {
            Iterator conflicts = conflictList.iterator();
            while(conflicts.hasNext())
            {
                ConflictObj o = (ConflictObj)conflicts.next();
                Integer offset = Integer.valueOf(o.offset);
                if(document.getXrefTable().containsValue(offset))
                {
                    COSObject pdfObject = document.getObjectFromPool(o.objectKey);
                    pdfObject.setObject(o.object.getObject());
                }
            }
        }
    }
}