/** * Copyright (c) 2005, www.pdfbox.org * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of pdfbox; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://www.pdfbox.org * */ package org.pdfbox.examples.util; import org.pdfbox.exceptions.InvalidPasswordException; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage; import org.pdfbox.util.PDFTextStripperByArea; import java.awt.Rectangle; import java.util.List; /** * This is an example on how to extract text from a specific area on the PDF document. * * Usage: java org.pdfbox.examples.util.ExtractTextByArea <input-pdf> * * @author Ben Litchfield (ben@benlitchfield.com) * @version $Revision: 1.1 $ */ public class ExtractTextByArea { private ExtractTextByArea() { //utility class and should not be constructed. } /** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main( String[] args ) throws Exception { if( args.length != 1 ) { usage(); } else { PDDocument document = null; try { document = PDDocument.load( args[0] ); if( document.isEncrypted() ) { try { document.decrypt( "" ); } catch( InvalidPasswordException e ) { System.err.println( "Error: Document is encrypted with a password." ); System.exit( 1 ); } } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition( true ); Rectangle rect = new Rectangle( 10, 280, 275, 60 ); stripper.addRegion( "class1", rect ); List allPages = document.getDocumentCatalog().getAllPages(); PDPage firstPage = (PDPage)allPages.get( 0 ); stripper.extractRegions( firstPage ); System.out.println( "Text in the area:" + rect ); System.out.println( stripper.getTextForRegion( "class1" ) ); } finally { if( document != null ) { document.close(); } } } } /** * This will print the usage for this document. */ private static void usage() { System.err.println( "Usage: java org.pdfbox.examples.util.ExtractTextByArea " ); } }