aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/ExtractText.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/ExtractText.java')
-rw-r--r-- src/main/java/org/pdfbox/ExtractText.java 270
1 files changed, 270 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/ExtractText.java b/src/main/java/org/pdfbox/ExtractText.java
new file mode 100644
index 0000000..5f5a328
--- /dev/null
+++ b/src/main/java/org/pdfbox/ExtractText.java
@@ -0,0 +1,270 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox;
+
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import org.pdfbox.exceptions.InvalidPasswordException;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.util.PDFText2HTML;
+import org.pdfbox.util.PDFTextStripper;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Command line program that loads a PDF document and writes its text either to
+ * a file or to the console, as plain text or as simple HTML.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.9 $
+ */
+public class ExtractText
+{
+    private static final Logger LOG = Logger.getLogger( ExtractText.class );
+
+    /**
+     * This is the default encoding of the text to be output.  A value of null
+     * means that the writer is created without an explicit encoding, i.e. the
+     * platform default is used.
+     */
+    public static final String DEFAULT_ENCODING =
+        null;
+        //"ISO-8859-1";
+        //"ISO-8859-6"; //arabic
+        //"US-ASCII";
+        //"UTF-8";
+        //"UTF-16";
+        //"UTF-16BE";
+        //"UTF-16LE";
+
+
+    private static final String PASSWORD = "-password";
+    private static final String ENCODING = "-encoding";
+    private static final String CONSOLE = "-console";
+    private static final String START_PAGE = "-startPage";
+    private static final String END_PAGE = "-endPage";
+    private static final String SORT = "-sort";
+    private static final String HTML = "-html"; // jjb - added simple HTML output
+
+    /**
+     * private constructor.
+     */
+    private ExtractText()
+    {
+        //static class
+    }
+
+    /**
+     * Parses the command line, loads the PDF document and writes the extracted
+     * text to the requested destination.
+     *
+     * @param args Command line arguments: any of the options listed by
+     *             {@link #usage()}, followed by the PDF file and an optional
+     *             output file.
+     *
+     * @throws Exception If there is an error parsing the document or writing
+     *                   the output.
+     */
+    public static void main( String[] args ) throws Exception
+    {
+        boolean toConsole = false;
+        boolean toHTML = false;
+        boolean sort = false;
+        // Remembers whether -password was given so that a failed decryption
+        // can be reported correctly regardless of how many other arguments
+        // are present on the command line.
+        boolean passwordSupplied = false;
+        String password = "";
+        String encoding = DEFAULT_ENCODING;
+        String pdfFile = null;
+        String textFile = null;
+        int startPage = 1;
+        int endPage = Integer.MAX_VALUE;
+        for( int i=0; i<args.length; i++ )
+        {
+            if( args[i].equals( PASSWORD ) )
+            {
+                password = requireValue( args, ++i );
+                passwordSupplied = true;
+            }
+            else if( args[i].equals( ENCODING ) )
+            {
+                encoding = requireValue( args, ++i );
+            }
+            else if( args[i].equals( START_PAGE ) )
+            {
+                startPage = parsePageNumber( START_PAGE, requireValue( args, ++i ) );
+            }
+            else if( args[i].equals( HTML ) )
+            {
+                toHTML = true;
+            }
+            else if( args[i].equals( SORT ) )
+            {
+                sort = true;
+            }
+            else if( args[i].equals( END_PAGE ) )
+            {
+                endPage = parsePageNumber( END_PAGE, requireValue( args, ++i ) );
+            }
+            else if( args[i].equals( CONSOLE ) )
+            {
+                toConsole = true;
+            }
+            else
+            {
+                // The first positional argument is the PDF, any later one
+                // names the output file.
+                if( pdfFile == null )
+                {
+                    pdfFile = args[i];
+                }
+                else
+                {
+                    textFile = args[i];
+                }
+            }
+        }
+
+        if( pdfFile == null )
+        {
+            usage();
+        }
+
+        if( textFile == null )
+        {
+            textFile = defaultTextFileName( pdfFile );
+        }
+
+        Writer output = null;
+        PDDocument document = null;
+        try
+        {
+            document = PDDocument.load( pdfFile );
+
+            //document.print();
+            if( document.isEncrypted() )
+            {
+                try
+                {
+                    document.decrypt( password );
+                }
+                catch( InvalidPasswordException e )
+                {
+                    if( passwordSupplied )//they supplied the wrong password
+                    {
+                        System.err.println( "Error: The supplied password is incorrect." );
+                        System.exit( 2 );
+                    }
+                    else
+                    {
+                        //they didn't supply a password and the default of "" was wrong.
+                        System.err.println( "Error: The document is encrypted." );
+                        usage();
+                    }
+                }
+            }
+            if( toConsole )
+            {
+                // Honour -encoding for console output as well.
+                if( encoding != null )
+                {
+                    output = new OutputStreamWriter( System.out, encoding );
+                }
+                else
+                {
+                    output = new OutputStreamWriter( System.out );
+                }
+            }
+            else
+            {
+                if( encoding != null )
+                {
+                    output = new OutputStreamWriter(
+                        new FileOutputStream( textFile ), encoding );
+                }
+                else
+                {
+                    //use default encoding
+                    output = new OutputStreamWriter(
+                        new FileOutputStream( textFile ) );
+                }
+            }
+
+            PDFTextStripper stripper = null;
+            if(toHTML)
+            {
+                stripper = new PDFText2HTML();
+            }
+            else
+            {
+                stripper = new PDFTextStripper();
+            }
+            stripper.setSortByPosition( sort );
+            stripper.setStartPage( startPage );
+            stripper.setEndPage( endPage );
+            stripper.writeText( document, output );
+        }
+        finally
+        {
+            // Nested so that the document is still closed when closing the
+            // writer fails.
+            try
+            {
+                if( output != null )
+                {
+                    output.close();
+                }
+            }
+            finally
+            {
+                if( document != null )
+                {
+                    document.close();
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the value that follows an option on the command line, or prints
+     * the usage and exits when the option is the last argument.
+     *
+     * @param args The command line arguments.
+     * @param index The index at which the value is expected.
+     *
+     * @return The argument at the given index.
+     */
+    private static String requireValue( String[] args, int index )
+    {
+        if( index >= args.length )
+        {
+            usage();
+        }
+        return args[index];
+    }
+
+    /**
+     * Parses the value of a page option, printing an error and the usage when
+     * the value is not an integer.
+     *
+     * @param option The name of the option being parsed, used in the error message.
+     * @param value The text to parse.
+     *
+     * @return The parsed page number.
+     */
+    private static int parsePageNumber( String option, String value )
+    {
+        try
+        {
+            return Integer.parseInt( value );
+        }
+        catch( NumberFormatException e )
+        {
+            System.err.println( "Error: " + option + " requires an integer but got '" + value + "'." );
+            usage();
+            // usage() terminates the JVM; the rethrow only satisfies the compiler.
+            throw e;
+        }
+    }
+
+    /**
+     * Derives the output file name that is used when none is given on the
+     * command line: a trailing ".pdf" (in any case) is replaced by ".txt",
+     * any other name simply gets ".txt" appended.
+     *
+     * @param pdfFile The name of the PDF file.
+     *
+     * @return The name of the text file to write.
+     */
+    private static String defaultTextFileName( String pdfFile )
+    {
+        String extension = ".pdf";
+        int baseLength = pdfFile.length() - extension.length();
+        // regionMatches returns false for a negative offset, so names shorter
+        // than the extension fall through to the append case.
+        if( pdfFile.regionMatches( true, baseLength, extension, 0, extension.length() ) )
+        {
+            return pdfFile.substring( 0, baseLength ) + ".txt";
+        }
+        return pdfFile + ".txt";
+    }
+
+    /**
+     * This will print the usage requirements and exit.
+     */
+    private static void usage()
+    {
+        System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
+            " -password <password> Password to decrypt document\n" +
+            " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
+            " -console Send text to console instead of file\n" +
+            " -html Output in HTML format instead of raw text\n" +
+            " -sort Sort the text before writing\n" +
+            " -startPage <number> The first page to start extraction(1 based)\n" +
+            " -endPage <number> The last page to extract(inclusive)\n" +
+            " <PDF file> The PDF document to use\n" +
+            " [Text File] The file to write the text to\n"
+            );
+        System.exit( 1 );
+    }
+} \ No newline at end of file