Home » pdfbox-1.1.0-src » org.apache.pdfbox.pdfparser » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.pdfbox.pdfparser;
   18   
   19   import java.io.File;
   20   import java.io.InputStream;
   21   import java.io.IOException;
   22   
   23   import java.util.ArrayList;
   24   import java.util.Iterator;
   25   import java.util.List;
   26   import java.util.regex.Pattern;
   27   
   28   import org.apache.commons.logging.Log;
   29   import org.apache.commons.logging.LogFactory;
   30   import org.apache.pdfbox.cos.COSBase;
   31   import org.apache.pdfbox.cos.COSDictionary;
   32   import org.apache.pdfbox.cos.COSDocument;
   33   import org.apache.pdfbox.cos.COSInteger;
   34   import org.apache.pdfbox.cos.COSObject;
   35   import org.apache.pdfbox.exceptions.WrappedIOException;
   36   import org.apache.pdfbox.io.RandomAccess;
   37   
   38   import org.apache.pdfbox.pdmodel.PDDocument;
   39   
   40   import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
   41   
   42   import org.apache.pdfbox.persistence.util.COSObjectKey;
   43   
   44   /**
   45    * This class will handle the parsing of the PDF document.
   46    *
   47    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
   48    * @version $Revision: 1.53 $
   49    */
   50   public class PDFParser extends BaseParser
   51   {
   52   
   53       /**
   54        * Log instance.
   55        */
   56       private static final Log log = LogFactory.getLog(PDFParser.class);
   57   
   58       private static final int SPACE_BYTE = 32;
   59   
   60       private static final String PDF_HEADER = "%PDF-";
   61       private static final String FDF_HEADER = "%FDF-";
   62       private boolean forceParsing = false; 
   63       
   64       /**
   65        * A list of duplicate objects found when Parsing the PDF
   66        * File. 
   67        */
   68       private List conflictList = new ArrayList();
   69      
   70       /**
   71        * Temp file directory.
   72        */
   73       private File tempDirectory = null;
   74   
   75       private RandomAccess raf = null;
   76   
   77       /**
   78        * Constructor.
   79        *
   80        * @param input The input stream that contains the PDF document.
   81        *
   82        * @throws IOException If there is an error initializing the stream.
   83        */
   84       public PDFParser( InputStream input ) throws IOException
   85       {
   86           this(input, null);
   87       }
   88   
   89       /**
   90        * Constructor to allow control over RandomAccessFile.
   91        * @param input The input stream that contains the PDF document.
   92        * @param rafi The RandomAccessFile to be used in internal COSDocument
   93        *
   94        * @throws IOException If there is an error initializing the stream.
   95        */
   96       public PDFParser(InputStream input, RandomAccess rafi)
   97           throws IOException
   98       {
   99           super(input);
  100           this.raf = rafi;
  101       }
  102       
  103       /**
  104        * Constructor to allow control over RandomAccessFile.
  105        * Also enables parser to skip corrupt objects to try and force parsing
  106        * @param input The input stream that contains the PDF document.
  107        * @param rafi The RandomAccessFile to be used in internal COSDocument
  108        * @param force When true, the parser will skip corrupt pdf objects and 
  109        * will continue parsing at the next object in the file
  110        *
  111        * @throws IOException If there is an error initializing the stream.
  112        */
  113       public PDFParser(InputStream input, RandomAccess rafi, boolean force)
  114           throws IOException
  115       {
  116           super(input);
  117           this.raf = rafi;
  118           this.forceParsing = force;
  119       }
  120   
  121       /**
  122        * This is the directory where pdfbox will create a temporary file
  123        * for storing pdf document stream in.  By default this directory will
  124        * be the value of the system property java.io.tmpdir.
  125        *
  126        * @param tmpDir The directory to create scratch files needed to store
  127        *        pdf document streams.
  128        */
  129       public void setTempDirectory( File tmpDir )
  130       {
  131           tempDirectory = tmpDir;
  132       }
  133   
  134       /**
  135        * This will parse the stream and populate the COSDocument object.  This will close
  136        * the stream when it is done parsing.
  137        *
  138        * @throws IOException If there is an error reading from the stream or corrupt data
  139        * is found.
  140        */
  141       public void parse() throws IOException
  142       {
  143           try
  144           {
  145               if ( raf == null )
  146               {
  147                   if( tempDirectory != null )
  148                   {
  149                       document = new COSDocument( tempDirectory );
  150                   }
  151                   else
  152                   {
  153                       document = new COSDocument();
  154                   }
  155               }
  156               else
  157               {
  158                   document = new COSDocument( raf );
  159               }
  160               setDocument( document );
  161   
  162               parseHeader();
  163               
  164               //Some PDF files have garbage between the header and the
  165               //first object
  166               skipToNextObj();
  167   
  168               boolean wasLastParsedObjectEOF = false;
  169               try
  170               {
  171                   while(true)
  172                   {
  173                       if(pdfSource.isEOF())
  174                       {
  175                           break;
  176                       }
  177                       try
  178                       {
  179                           wasLastParsedObjectEOF = parseObject();
  180                       }
  181                       catch(IOException e)
  182                       {
  183                           if(forceParsing)
  184                           {
  185                               /*
  186                                * Warning is sent to the PDFBox.log and to the Console that
  187                                * we skipped over an object
  188                                */
  189                               log.warn("Parsing Error, Skipping Object", e);
  190                               skipToNextObj();
  191                           }
  192                           else
  193                           { 
  194                               throw e;
  195                           }
  196                       }
  197                       skipSpaces();
  198                   }
  199                   //Test if we saw a trailer section. If not, look for an XRef Stream (Cross-Reference Stream) 
  200                   //to populate the trailer and xref information. For PDF 1.5 and above 
  201                   if( document.getTrailer() == null )
  202                   {
  203                       document.parseXrefStreams();
  204                   }
  205                   if( !document.isEncrypted() )
  206                   {
  207                       document.dereferenceObjectStreams();
  208                   }
  209                   ConflictObj.resolveConflicts(document, conflictList);     
  210               }
  211               catch( IOException e )
  212               {
  213                   /*
  214                    * PDF files may have random data after the EOF marker. Ignore errors if
  215                    * last object processed is EOF. 
  216                    */
  217                   if( !wasLastParsedObjectEOF )
  218                   {
  219                       throw e;
  220                   } 
  221               }
  222           }
  223           catch( Throwable t )
  224           {
  225               //so if the PDF is corrupt then close the document and clear
  226               //all resources to it
  227               if( document != null )
  228               {
  229                   document.close();
  230               }
  231               if( t instanceof IOException )
  232               {
  233                   throw (IOException)t;
  234               }
  235               else
  236               {
  237                   throw new WrappedIOException( t );
  238               }
  239           }
  240           finally
  241           {
  242               pdfSource.close();
  243           }
  244       }
  245       
  246       /**
  247        * Skip to the start of the next object.  This is used to recover
  248        * from a corrupt object. This should handle all cases that parseObject
  249        * supports. This assumes that the next object will
  250        * start on its own line.
  251        * 
  252        * @throws IOException 
  253        */
  254       private void skipToNextObj() throws IOException 
  255       {
  256           byte[] b = new byte[16];
  257           Pattern p = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
  258           /* Read a buffer of data each time to see if it starts with a
  259            * known keyword. This is not the most efficient design, but we should
  260            * rarely be needing this function. We could update this to use the 
  261            * circular buffer, like in readUntilEndStream().
  262            */
  263           while(!pdfSource.isEOF())
  264           {
  265                int l = pdfSource.read(b);
  266                if(l < 1)
  267                {
  268                    break;
  269                }
  270                String s = new String(b, "US-ASCII");  
  271                if(s.startsWith("trailer") ||
  272                        s.startsWith("xref") || 
  273                        s.startsWith("startxref") ||
  274                        s.startsWith("stream") ||
  275                        p.matcher(s).matches())
  276                {
  277                    pdfSource.unread(b);
  278                    break;
  279                }
  280                else
  281                {
  282                    pdfSource.unread(b, 1, l-1);
  283                }
  284           }   
  285       }
  286   
  287       private void parseHeader() throws IOException
  288       {
  289           // read first line
  290           String header = readLine();
  291           // some pdf-documents are broken and the pdf-version is in one of the following lines
  292           if ((header.indexOf( PDF_HEADER ) == -1) && (header.indexOf( FDF_HEADER ) == -1))
  293           {
  294               header = readLine();
  295               while ((header.indexOf( PDF_HEADER ) == -1) && (header.indexOf( FDF_HEADER ) == -1))
  296               {
  297                   // if a line starts with a digit, it has to be the first one with data in it
  298                   if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
  299                   {
  300                       break;
  301                   }
  302                   header = readLine();
  303               }
  304           }
  305   
  306           // nothing found
  307           if ((header.indexOf( PDF_HEADER ) == -1) && (header.indexOf( FDF_HEADER ) == -1))
  308           {
  309               throw new IOException( "Error: Header doesn't contain versioninfo" );
  310           }
  311           
  312           //sometimes there are some garbage bytes in the header before the header
  313           //actually starts, so lets try to find the header first.
  314           int headerStart = header.indexOf( PDF_HEADER );
  315           if (headerStart == -1)
  316           {
  317               headerStart = header.indexOf(FDF_HEADER);
  318           }
  319   
  320           //greater than zero because if it is zero then
  321           //there is no point of trimming
  322           if ( headerStart > 0 )
  323           {
  324               //trim off any leading characters
  325               header = header.substring( headerStart, header.length() );
  326           }
  327   
  328           /*
  329            * This is used if there is garbage after the header on the same line
  330            */
  331           if (header.startsWith(PDF_HEADER)) 
  332           {
  333               if(!header.matches(PDF_HEADER + "\\d.\\d")) 
  334               {
  335                   String headerGarbage = header.substring(PDF_HEADER.length()+3, header.length()) + "\n";
  336                   header = header.substring(0, PDF_HEADER.length()+3);
  337                   pdfSource.unread(headerGarbage.getBytes());
  338               }
  339           }
  340           else 
  341           {
  342               if(!header.matches(FDF_HEADER + "\\d.\\d")) 
  343               {
  344                   String headerGarbage = header.substring(FDF_HEADER.length()+3, header.length()) + "\n";
  345                   header = header.substring(0, FDF_HEADER.length()+3);
  346                   pdfSource.unread(headerGarbage.getBytes());
  347               }
  348           }
  349           document.setHeaderString(header);
  350           
  351           try
  352           {
  353               if (header.startsWith( PDF_HEADER )) 
  354               {
  355                   float pdfVersion = Float. parseFloat(
  356                           header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) );
  357                   document.setVersion( pdfVersion );
  358               }
  359               else 
  360               {
  361                   float pdfVersion = Float. parseFloat(
  362                           header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) );
  363                   document.setVersion( pdfVersion );
  364               }
  365           }
  366           catch ( NumberFormatException e )
  367           {
  368               throw new IOException( "Error getting pdf version:" + e );
  369           } 
  370       } 
  371   
  372       /**
  373        * This will get the document that was parsed.  parse() must be called before this is called.
  374        * When you are done with this document you must call close() on it to release
  375        * resources.
  376        *
  377        * @return The document that was parsed.
  378        *
  379        * @throws IOException If there is an error getting the document.
  380        */
  381       public COSDocument getDocument() throws IOException
  382       {
  383           if( document == null )
  384           {
  385               throw new IOException( "You must call parse() before calling getDocument()" );
  386           }
  387           return document;
  388       }
  389   
  390       /**
  391        * This will get the PD document that was parsed.  When you are done with
  392        * this document you must call close() on it to release resources.
  393        *
  394        * @return The document at the PD layer.
  395        *
  396        * @throws IOException If there is an error getting the document.
  397        */
  398       public PDDocument getPDDocument() throws IOException
  399       {
  400           return new PDDocument( getDocument() );
  401       }
  402   
  403       /**
  404        * This will get the FDF document that was parsed.  When you are done with
  405        * this document you must call close() on it to release resources.
  406        *
  407        * @return The document at the PD layer.
  408        *
  409        * @throws IOException If there is an error getting the document.
  410        */
  411       public FDFDocument getFDFDocument() throws IOException
  412       {
  413           return new FDFDocument( getDocument() );
  414       }
  415   
  416       /**
  417        * This will parse the next object from the stream and add it to 
  418        * the local state. 
  419        *
  420        * @return Returns true if the processed object had an endOfFile marker
  421        *
  422        * @throws IOException If an IO error occurs.
  423        */
  424       private boolean parseObject() throws IOException
  425       {
  426           int currentObjByteOffset = pdfSource.getOffset();
  427           boolean isEndOfFile = false; 
  428           skipSpaces();
  429           //peek at the next character to determine the type of object we are parsing
  430           char peekedChar = (char)pdfSource.peek();
  431           
  432           //ignore endobj and endstream sections.
  433           while( peekedChar == 'e' )
  434           {
  435               //there are times when there are multiple endobj, so lets
  436               //just read them and move on.
  437               readString();
  438               skipSpaces();
  439               peekedChar = (char)pdfSource.peek();
  440           }
  441           if( pdfSource.isEOF())
  442           {
  443               //"Skipping because of EOF" );
  444               //end of file we will return a false and call it a day.
  445           }
  446           //xref table. Note: The contents of the Xref table are currently ignored
  447           else if( peekedChar == 'x') 
  448           {
  449               parseXrefTable();
  450           }
  451           // Note: startxref can occur in either a trailer section or by itself 
  452           else if (peekedChar == 't' || peekedChar == 's') 
  453           {
  454               if(peekedChar == 't')
  455               {
  456                   parseTrailer();
  457                   peekedChar = (char)pdfSource.peek(); 
  458               }
  459               if (peekedChar == 's')
  460               {  
  461                   parseStartXref();
  462                   //verify that EOF exists 
  463                   String eof = readExpectedString( "%%EOF" );
  464                   if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
  465                   {
  466                       throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
  467                               " next=" +readString() );
  468                   }
  469                   isEndOfFile = true; 
  470               }
  471           }
  472           //we are going to parse an normal object
  473           else
  474           {
  475               int number = -1;
  476               int genNum = -1;
  477               String objectKey = null;
  478               boolean missingObjectNumber = false;
  479               try
  480               {
  481                   char peeked = (char)pdfSource.peek();
  482                   if( peeked == '<' )
  483                   {
  484                       missingObjectNumber = true;
  485                   }
  486                   else
  487                   {
  488                       number = readInt();
  489                   }
  490               }
  491               catch( IOException e )
  492               {
  493                   //ok for some reason "GNU Ghostscript 5.10" puts two endobj
  494                   //statements after an object, of course this is nonsense
  495                   //but because we want to support as many PDFs as possible
  496                   //we will simply try again
  497                   number = readInt();
  498               }
  499               if( !missingObjectNumber )
  500               {
  501                   skipSpaces();
  502                   genNum = readInt();
  503   
  504                   objectKey = readString( 3 );
  505                   //System.out.println( "parseObject() num=" + number +
  506                   //" genNumber=" + genNum + " key='" + objectKey + "'" );
  507                   if( !objectKey.equals( "obj" ) )
  508                   {
  509                       throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
  510                   }
  511               }
  512               else
  513               {
  514                   number = -1;
  515                   genNum = -1;
  516               }
  517   
  518               skipSpaces();
  519               COSBase pb = parseDirObject();
  520               String endObjectKey = readString();
  521               
  522               if( endObjectKey.equals( "stream" ) )
  523               {
  524                   pdfSource.unread( endObjectKey.getBytes() );
  525                   pdfSource.unread( ' ' );
  526                   if( pb instanceof COSDictionary )
  527                   {
  528                       pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
  529                   }
  530                   else
  531                   {
  532                       // this is not legal
  533                       // the combination of a dict and the stream/endstream forms a complete stream object
  534                       throw new IOException("stream not preceded by dictionary");
  535                   }
  536                   endObjectKey = readString();
  537               }
  538               
  539               COSObjectKey key = new COSObjectKey( number, genNum );
  540               COSObject pdfObject = document.getObjectFromPool( key );
  541               if(pdfObject.getObject() == null)
  542               {
  543                   pdfObject.setObject(pb);
  544               }
  545               /*
  546                * If the object we returned already has a baseobject, then we have a conflict
  547                * which we will resolve using information after we parse the xref table.
  548                */
  549               else
  550               {
  551                   addObjectToConflicts(currentObjByteOffset, key, pb); 
  552               }
  553               
  554               if( !endObjectKey.equals( "endobj" ) )
  555               {
  556                                  if (endObjectKey.startsWith( "endobj" ) ) 
  557                                  {
  558                                          /*
  559                                            * Some PDF files don't contain a new line after endobj so we 
  560                                            * need to make sure that the next object number is getting read separately
  561                                            * and not part of the endobj keyword. Ex. Some files would have "endobj28"
  562                                            * instead of "endobj"
  563                                            */
  564                                           pdfSource.unread( endObjectKey.substring( 6 ).getBytes() );
  565                                       } 
  566                                       else if( !pdfSource.isEOF() )                
  567                                       {
  568                       try
  569                       {
  570                           //It is possible that the endobj  is missing, there
  571                           //are several PDFs out there that do that so skip it and move on.
  572                           Float.parseFloat( endObjectKey );
  573                           pdfSource.unread( SPACE_BYTE );
  574                           pdfSource.unread( endObjectKey.getBytes() );
  575                       }
  576                       catch( NumberFormatException e )
  577                       {
  578                           //we will try again incase there was some garbage which
  579                           //some writers will leave behind.
  580                           String secondEndObjectKey = readString();
  581                           if( !secondEndObjectKey.equals( "endobj" ) )
  582                           {
  583                               if( isClosing() )
  584                               {
  585                                   //found a case with 17506.pdf object 41 that was like this
  586                                   //41 0 obj [/Pattern /DeviceGray] ] endobj
  587                                   //notice the second array close, here we are reading it
  588                                   //and ignoring and attempting to continue
  589                                   pdfSource.read();
  590                               }
  591                               skipSpaces();
  592                               String thirdPossibleEndObj = readString();
  593                               if( !thirdPossibleEndObj.equals( "endobj" ) )
  594                               {
  595                                   throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
  596                                       "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
  597                               }
  598                           }
  599                       }
  600                   }
  601               }
  602               skipSpaces();
  603           }
  604           return isEndOfFile;
  605       }
  606       
  607      /**
  608       * Adds a new ConflictObj to the conflictList.
  609       * @param offset the offset of the ConflictObj
  610       * @param key The COSObjectKey of this object
  611       * @param pb The COSBase of this conflictObj
  612       * @throws IOException
  613       */
  614       private void addObjectToConflicts(int offset, COSObjectKey key, COSBase pb) throws IOException
  615       {
  616           COSObject obj = new COSObject(null);
  617           obj.setObjectNumber( COSInteger.get( key.getNumber() ) );
  618           obj.setGenerationNumber( COSInteger.get( key.getGeneration() ) );
  619           obj.setObject(pb);
  620           ConflictObj conflictObj = new ConflictObj(offset, key, obj);
  621           conflictList.add(conflictObj);   
  622       }
  623   
  624       /**
  625        * This will parse the startxref section from the stream.
  626        * The startxref value is ignored.
  627        *            
  628        * @return false on parsing error 
  629        * @throws IOException If an IO error occurs.
  630        */
  631       private boolean parseStartXref() throws IOException
  632       {
  633           if(pdfSource.peek() != 's')
  634           {
  635               return false; 
  636           }
  637           String startXRef = readString();
  638           if( !startXRef.trim().equals( "startxref" ) )
  639           {
  640               return false;
  641           }
  642           skipSpaces();
  643           /* This integer is the byte offset of the first object referenced by the xref or xref stream
  644            * Not needed for PDFbox
  645            */
  646           readInt();
  647           return true;
  648       }
  649   
  650   
  651       /**
  652        * This will parse the xref table from the stream and add it to the state
  653        * The XrefTable contents are ignored.
  654        *            
  655        * @return false on parsing error 
  656        * @throws IOException If an IO error occurs.
  657        */
  658       private boolean parseXrefTable() throws IOException
  659       {
  660           if(pdfSource.peek() != 'x')
  661           {
  662               return false;
  663           }
  664           String xref = readString();
  665           if( !xref.trim().equals( "xref" ) ) 
  666           {
  667               return false;
  668           }
  669           /*
  670            * Xref tables can have multiple sections. 
  671            * Each starts with a starting object id and a count.
  672            */
  673           while(true)
  674           {
  675               int currObjID = readInt(); // first obj id
  676               int count = readInt(); // the number of objects in the xref table
  677               skipSpaces();
  678               for(int i = 0; i < count; i++)
  679               {
  680                   if(pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()))
  681                   {
  682                       break;
  683                   }
  684                   if(pdfSource.peek() == 't')
  685                   {
  686                       break;
  687                   }
  688                   //Ignore table contents
  689                   String currentLine = readLine();
  690                   String[] splitString = currentLine.split(" ");
  691                   if (splitString.length < 3)
  692                   {
  693                       log.warn("invalid xref line: " + currentLine);
  694                       break;
  695                   }
  696                   /* This supports the corrupt table as reported in 
  697                    * PDFBOX-474 (XXXX XXX XX n) */
  698                   if(splitString[splitString.length-1].equals("n"))
  699                   {
  700                       try
  701                       {
  702                           int currOffset = Integer.parseInt(splitString[0]);
  703                           int currGenID = Integer.parseInt(splitString[1]);
  704                           COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
  705                           document.setXRef(objKey, currOffset);
  706                       }
  707                       catch(NumberFormatException e)
  708                       {
  709                           throw new IOException(e.getMessage());
  710                       }
  711                   }
  712                   else if(!splitString[2].equals("f"))
  713                   {
  714                       throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
  715                   }
  716                   currObjID++;
  717                   skipSpaces();
  718               }
  719               skipSpaces();
  720               char c = (char)pdfSource.peek();
  721               if(c < '0' || c > '9')
  722               {
  723                   break;
  724               }
  725           }
  726           return true;
  727       }
  728   
  729       /**
  730        * This will parse the trailer from the stream and add it to the state: the
  731        * first trailer becomes the document trailer, later ones are added to it.
  732        *
  733        * @return false if the stream is not positioned at the "trailer" keyword,
  734        *         true once the trailer dictionary has been parsed and stored
  735        * @throws IOException If an IO error occurs.
  736        */
  737       private boolean parseTrailer() throws IOException
  738       {
  739           if(pdfSource.peek() != 't')
  740           {
  741               return false;
  742           }
  743           //read "trailer"
  744           String nextLine = readLine();
  745           if( !nextLine.trim().equals( "trailer" ) )
  746           {
  747               if( !nextLine.startsWith("trailer") )
  748               {
  749                   return false;
  750               }
  751               // in some cases the EOL is missing and the trailer immediately
  752               // continues with "<<" or with a blank character; this does not comply
  753               // with the PDF reference, but Acrobat reader accepts it, so push the
  754               // rest of the line back for the dictionary parser.
  755               // PDF data is bytes, not text: encode with ISO-8859-1 (one byte per
  756               // char) instead of the platform charset, which may map characters
  757               // differently. Assumes readLine() builds one char per byte - confirm.
  758               byte[] b = nextLine.getBytes("ISO-8859-1");
  759               int len = "trailer".length();
  760               pdfSource.unread('\n');
  761               pdfSource.unread(b, len, b.length-len);
  762           }
  763   
  764           // the dictionary may be preceded by blanks (e.g. "trailer <<")
  765           skipSpaces();
  766   
  767           COSDictionary parsedTrailer = parseCOSDictionary();
  768           COSDictionary docTrailer = document.getTrailer();
  769           if( docTrailer == null )
  770           {
  771               document.setTrailer( parsedTrailer );
  772           }
  773           else
  774           {
  775               docTrailer.addAll( parsedTrailer );
  776           }
  777           skipSpaces();
  778           return true;
  779       }
  780       
  781       /**
  782        * Records an object whose id number collides with that of another object
  783        * in the same PDF document. The xref table is not available while objects
  784        * are being parsed, so colliding objects are queued as instances of this
  785        * class and settled once the xref table has been parsed: an object whose
  786        * offset is listed in the xref table is kept, the others are ignored.
  787        */
  788       private static class ConflictObj
  789       {
  790           // offset of the object; matched against the values of the xref table
  791           private int offset;
  792           // key under which the document pools the object
  793           private COSObjectKey objectKey;
  794           // parsed candidate whose content may replace that of the pooled object
  795           private COSObject object;
  796   
  797           public ConflictObj(int offsetValue, COSObjectKey key, COSObject pdfObject)
  798           {
  799               offset = offsetValue;
  800               objectKey = key;
  801               object = pdfObject;
  802           }
  803           public String toString()
  804           {
  805               return "Object(" + offset + ", " + objectKey + ")";
  806           }
  807   
  808           /**
  809            * Walks the queued conflicts and, for each one whose offset appears as a
  810            * value in the document's xref table, installs its content in the object
  811            * the document pools under the same key. Conflicts whose offset is not
  812            * in the xref table are left untouched.
  813            *
  814            * @param document the document providing the xref table and object pool
  815            * @param conflictList the queued {@link ConflictObj} instances
  816            * @throws IOException if resolving a conflict against the document fails
  817            */
  818           private static void resolveConflicts(COSDocument document, List conflictList) throws IOException
  819           {
  820               for (Iterator it = conflictList.iterator(); it.hasNext();)
  821               {
  822                   ConflictObj candidate = (ConflictObj)it.next();
  823                   if (!document.getXrefTable().containsValue(new Integer(candidate.offset)))
  824                   {
  825                       continue;
  826                   }
  827                   document.getObjectFromPool(candidate.objectKey).setObject(candidate.object.getObject());
  828               }
  829           }
  830       }
  831   }

Home » pdfbox-1.1.0-src » org.apache.pdfbox.pdfparser » [javadoc | source]