1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.pdfbox.examples.util; 18 19 import org.apache.pdfbox.exceptions.InvalidPasswordException; 20 21 import org.apache.pdfbox.pdmodel.PDDocument; 22 import org.apache.pdfbox.pdmodel.PDPage; 23 import org.apache.pdfbox.util.PDFTextStripperByArea; 24 25 import java.awt.Rectangle; 26 27 import java.util.List; 28 29 /** 30 * This is an example on how to extract text from a specific area on the PDF document. 31 * 32 * Usage: java org.apache.pdfbox.examples.util.ExtractTextByArea <input-pdf> 33 * 34 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> 35 * @version $Revision: 1.2 $ 36 */ 37 public class ExtractTextByArea 38 { 39 private ExtractTextByArea() 40 { 41 //utility class and should not be constructed. 42 } 43 44 45 /** 46 * This will print the documents text in a certain area. 47 * 48 * @param args The command line arguments. 49 * 50 * @throws Exception If there is an error parsing the document. 51 */ 52 public static void main( String[] args ) throws Exception 53 { 54 if( args.length != 1 ) 55 { 56 usage(); 57 } 58 else 59 { 60 PDDocument document = null; 61 try 62 { 63 document = PDDocument.load( args[0] ); 64 if( document.isEncrypted() ) 65 { 66 try 67 { 68 document.decrypt( "" ); 69 } 70 catch( InvalidPasswordException e ) 71 { 72 System.err.println( "Error: Document is encrypted with a password." ); 73 System.exit( 1 ); 74 } 75 } 76 PDFTextStripperByArea stripper = new PDFTextStripperByArea(); 77 stripper.setSortByPosition( true ); 78 Rectangle rect = new Rectangle( 10, 280, 275, 60 ); 79 stripper.addRegion( "class1", rect ); 80 List allPages = document.getDocumentCatalog().getAllPages(); 81 PDPage firstPage = (PDPage)allPages.get( 0 ); 82 stripper.extractRegions( firstPage ); 83 System.out.println( "Text in the area:" + rect ); 84 System.out.println( stripper.getTextForRegion( "class1" ) ); 85 86 } 87 finally 88 { 89 if( document != null ) 90 { 91 document.close(); 92 } 93 } 94 } 95 } 96 97 /** 98 * This will print the usage for this document. 99 */ 100 private static void usage() 101 { 102 System.err.println( "Usage: java org.apache.pdfbox.examples.util.ExtractTextByArea <input-pdf>" ); 103 } 104 105 }