From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../pdfbox/examples/util/ExtractTextByArea.java | 119 +++++++++++++++++ .../pdfbox/examples/util/PrintTextLocations.java | 144 +++++++++++++++++++++ .../java/org/pdfbox/examples/util/package.html | 9 ++ 3 files changed, 272 insertions(+) create mode 100644 src/main/java/org/pdfbox/examples/util/ExtractTextByArea.java create mode 100644 src/main/java/org/pdfbox/examples/util/PrintTextLocations.java create mode 100644 src/main/java/org/pdfbox/examples/util/package.html (limited to 'src/main/java/org/pdfbox/examples/util') diff --git a/src/main/java/org/pdfbox/examples/util/ExtractTextByArea.java b/src/main/java/org/pdfbox/examples/util/ExtractTextByArea.java new file mode 100644 index 0000000..042e3e6 --- /dev/null +++ b/src/main/java/org/pdfbox/examples/util/ExtractTextByArea.java @@ -0,0 +1,119 @@ +/** + * Copyright (c) 2005, www.pdfbox.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of pdfbox; nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.examples.util;
+
+import org.pdfbox.exceptions.InvalidPasswordException;
+
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.util.PDFTextStripperByArea;
+
+import java.awt.Rectangle;
+
+import java.util.List;
+
+/**
+ * This is an example on how to extract text from a specific area on the PDF document.
+ *
+ * Usage: java org.pdfbox.examples.util.ExtractTextByArea &lt;input-pdf&gt;
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.1 $
+ */
+public class ExtractTextByArea
+{
+    /** Name under which the sample rectangle is registered with the stripper. */
+    private static final String REGION_NAME = "class1";
+
+    private ExtractTextByArea()
+    {
+        //utility class and should not be constructed.
+    }
+
+
+    /**
+     * Loads the PDF named by the single command line argument and prints the
+     * text found inside a fixed rectangle (x=10, y=280, width=275, height=60)
+     * on the first page.
+     *
+     * An encrypted document is decrypted with the empty password; if that is
+     * rejected, or if the document has no pages, an error is written to
+     * standard error and the JVM exits with status 1 after the document has
+     * been closed.
+     *
+     * @param args The command line arguments; exactly one is expected, the path of the PDF.
+     *
+     * @throws Exception If there is an error parsing the document.
+     */
+    public static void main( String[] args ) throws Exception
+    {
+        if( args.length != 1 )
+        {
+            usage();
+            return;
+        }
+
+        // Failures are recorded here and turned into an exit status only after
+        // the finally block has run: calling System.exit() inside the try
+        // would terminate the JVM without closing the document.
+        boolean failed = false;
+        PDDocument document = null;
+        try
+        {
+            document = PDDocument.load( args[0] );
+            if( document.isEncrypted() )
+            {
+                try
+                {
+                    document.decrypt( "" );
+                }
+                catch( InvalidPasswordException e )
+                {
+                    System.err.println( "Error: Document is encrypted with a password." );
+                    failed = true;
+                }
+            }
+
+            if( !failed )
+            {
+                List allPages = document.getDocumentCatalog().getAllPages();
+                if( allPages.isEmpty() )
+                {
+                    // Guard the get( 0 ) below, which would otherwise throw
+                    // IndexOutOfBoundsException.
+                    System.err.println( "Error: Document contains no pages." );
+                    failed = true;
+                }
+                else
+                {
+                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+                    // NOTE(review): presumably orders the extracted text by its
+                    // position on the page -- confirm against PDFTextStripper.
+                    stripper.setSortByPosition( true );
+                    Rectangle rect = new Rectangle( 10, 280, 275, 60 );
+                    stripper.addRegion( REGION_NAME, rect );
+                    PDPage firstPage = (PDPage)allPages.get( 0 );
+                    stripper.extractRegions( firstPage );
+                    System.out.println( "Text in the area:" + rect );
+                    System.out.println( stripper.getTextForRegion( REGION_NAME ) );
+                }
+            }
+        }
+        finally
+        {
+            if( document != null )
+            {
+                document.close();
+            }
+        }
+
+        if( failed )
+        {
+            System.exit( 1 );
+        }
+    }
+
+    /**
+     * This will print the usage for this document.
+     */
+    private static void usage()
+    {
+        System.err.println( "Usage: java org.pdfbox.examples.util.ExtractTextByArea <input-pdf>" );
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/pdfbox/examples/util/PrintTextLocations.java b/src/main/java/org/pdfbox/examples/util/PrintTextLocations.java
new file mode 100644
index 0000000..6c83b57
--- /dev/null
+++ b/src/main/java/org/pdfbox/examples/util/PrintTextLocations.java
@@ -0,0 +1,144 @@
+/**
+ * Copyright (c) 2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of pdfbox; nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * http://www.pdfbox.org + * + */ +package org.pdfbox.examples.util; + +import org.pdfbox.exceptions.InvalidPasswordException; + +import org.pdfbox.pdfparser.PDFParser; + +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.pdmodel.PDPage; +import org.pdfbox.util.PDFTextStripper; +import org.pdfbox.util.TextPosition; + +import java.io.FileInputStream; +import java.io.IOException; + +import java.util.List; + +/** + * This is an example on how to get some x/y coordinates of text. + * + * Usage: java org.pdfbox.examples.util.PrintTextLocations <input-pdf> + * + * @author Ben Litchfield (ben@benlitchfield.com) + * @version $Revision: 1.2 $ + */ +public class PrintTextLocations extends PDFTextStripper +{ + /** + * Default constructor. 
+ * + * @throws IOException If there is an error loading text stripper properties. + */ + public PrintTextLocations() throws IOException + { + //default constructor. + } + + /** + * This will print the documents data. + * + * @param args The command line arguments. + * + * @throws Exception If there is an error parsing the document. + */ + public static void main( String[] args ) throws Exception + { + if( args.length != 1 ) + { + usage(); + } + else + { + PDDocument document = null; + FileInputStream file = null; + try + { + file = new FileInputStream( args[0] ); + PDFParser parser = new PDFParser( file ); + parser.parse(); + document = parser.getPDDocument(); + if( document.isEncrypted() ) + { + try + { + document.decrypt( "" ); + } + catch( InvalidPasswordException e ) + { + System.err.println( "Error: Document is encrypted with a password." ); + System.exit( 1 ); + } + } + PrintTextLocations printer = new PrintTextLocations(); + List allPages = document.getDocumentCatalog().getAllPages(); + for( int i=0; i" ); + } + +} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/examples/util/package.html b/src/main/java/org/pdfbox/examples/util/package.html new file mode 100644 index 0000000..bc50f59 --- /dev/null +++ b/src/main/java/org/pdfbox/examples/util/package.html @@ -0,0 +1,9 @@ + + + + + + +The classes in this package show how to use the PDFBox util API. + + -- cgit v1.2.3