/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.examples.util;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

import java.util.ArrayList;
import java.util.List;

/**
 * This is an example on how to remove all text from a PDF document.
 *
 * Every page's content stream is parsed into tokens, the text-showing
 * operators (<code>Tj</code>, <code>TJ</code>, <code>'</code> and
 * <code>"</code>) are dropped together with their operands, and the
 * remaining tokens are written back as the page's new content stream.
 *
 * Only the content stream returned by {@link PDPage#getContents()} is
 * rewritten; content referenced from elsewhere is not processed.
 *
 * Usage: java org.apache.pdfbox.examples.util.RemoveAllText &lt;input-pdf&gt; &lt;output-pdf&gt;
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.2 $
 */
public class RemoveAllText
{
    /**
     * Default constructor.
     */
    private RemoveAllText()
    {
        //example class should not be instantiated
    }

    /**
     * This will remove all text from a PDF document.
     *
     * Expects exactly two arguments: the input PDF and the output PDF. With
     * any other argument count the usage message is printed and nothing
     * else happens. Encrypted documents are rejected with an error message
     * and exit status 1.
     *
     * @param args The command line arguments.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        if( args.length != 2 )
        {
            usage();
        }
        else
        {
            PDDocument document = null;
            try
            {
                document = PDDocument.load( args[0] );
                if( document.isEncrypted() )
                {
                    System.err.println( "Error: Encrypted documents are not supported for this example." );
                    System.exit( 1 );
                }
                List allPages = document.getDocumentCatalog().getAllPages();
                for( int i=0; i<allPages.size(); i++ )
                {
                    PDPage page = (PDPage)allPages.get( i );
                    PDStream contents = page.getContents();
                    if( contents == null )
                    {
                        //no content stream was returned for this page, so there is nothing to strip
                        continue;
                    }
                    PDFStreamParser parser = new PDFStreamParser( contents );
                    parser.parse();
                    List newTokens = removeTextOperations( parser.getTokens() );

                    PDStream newContents = new PDStream( document );
                    ContentStreamWriter writer = new ContentStreamWriter( newContents.createOutputStream() );
                    writer.writeTokens( newTokens );
                    newContents.addCompression();
                    page.setContents( newContents );
                }
                document.save( args[1] );
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * Copies a content stream token list, leaving out every text-showing
     * operator and the operands that belong to it.
     *
     * @param tokens The parsed tokens of one content stream.
     *
     * @return A new list holding the tokens that are kept, in their original order.
     */
    private static List removeTextOperations( List tokens )
    {
        List newTokens = new ArrayList();
        for( int j=0; j<tokens.size(); j++ )
        {
            Object token = tokens.get( j );
            if( token instanceof PDFOperator )
            {
                int operands = textOperandCount( ((PDFOperator)token).getOperation() );
                if( operands >= 0 )
                {
                    //operands precede their operator, so they are already at the
                    //tail of newTokens; the operator itself is simply not copied
                    removeTrailingOperands( newTokens, operands );
                    continue;
                }
            }
            newTokens.add( token );
        }
        return newTokens;
    }

    /**
     * Tells how many operands a text-showing operator takes.
     *
     * @param operation The operator name as found in the content stream.
     *
     * @return The operand count, or -1 if the operator does not show text.
     */
    private static int textOperandCount( String operation )
    {
        if( "Tj".equals( operation ) || "TJ".equals( operation ) || "'".equals( operation ) )
        {
            //a single string (Tj, ') or a single array (TJ)
            return 1;
        }
        if( "\"".equals( operation ) )
        {
            //word spacing, character spacing and the string
            return 3;
        }
        return -1;
    }

    /**
     * Removes up to <code>count</code> operands from the end of the list.
     *
     * Removal stops early when the list runs empty or when the last token
     * is itself an operator, so that a malformed stream with missing
     * operands neither throws nor loses an unrelated operator.
     *
     * @param newTokens The list to trim; it is modified in place.
     * @param count The number of operands the removed operator expects.
     */
    private static void removeTrailingOperands( List newTokens, int count )
    {
        for( int k=0; k<count && !newTokens.isEmpty(); k++ )
        {
            int last = newTokens.size() - 1;
            if( newTokens.get( last ) instanceof PDFOperator )
            {
                break;
            }
            newTokens.remove( last );
        }
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.examples.util.RemoveAllText <input-pdf> <output-pdf>" );
    }

}