/**
 * Copyright (c) 2003-2004, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
/*
 * Provenance: this source was recovered from a deletion patch.
 *   Commit:     c68ad0ec056b37c82debebcecfcde1866d61b4d9
 *   Author:     tknall
 *   Date:       Tue, 25 Nov 2008 12:03:13 +0000
 *   Subject:    Removing pdfbox from source.
 *   git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@301 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
 *   Path:       src/main/java/org/pdfbox/ExtractText.java (blob 5f5a328, 270 lines deleted)
 */
package org.pdfbox;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.pdfbox.exceptions.InvalidPasswordException;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFText2HTML;
import org.pdfbox.util.PDFTextStripper;

import org.apache.log4j.Logger;

/**
 * This is the main program that simply parses the pdf document and transforms it
 * into text.
 *
 * @author Ben Litchfield (ben@csh.rit.edu)
 * @version $Revision: 1.9 $
 */
public class ExtractText
{
    private static final Logger LOG = Logger.getLogger( ExtractText.class );

    /**
     * This is the default encoding of the text to be output.
     * <code>null</code> makes the file writer fall back to the platform default
     * charset; any charset name such as "ISO-8859-1", "ISO-8859-6" (arabic),
     * "US-ASCII", "UTF-8", "UTF-16", "UTF-16BE" or "UTF-16LE" may be passed
     * with the -encoding option instead.
     */
    public static final String DEFAULT_ENCODING = null;

    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String CONSOLE = "-console";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String SORT = "-sort";
    private static final String HTML = "-html"; // jjb - added simple HTML output

    /** Extension (including the dot) that is stripped when deriving the output name. */
    private static final String PDF_EXTENSION = ".pdf";

    /**
     * private constructor.
     */
    private ExtractText()
    {
        //static class
    }

    /**
     * Infamous main method.
     *
     * Parses the command line, loads (and if necessary decrypts) the PDF and
     * writes its text either to the console or to a text file.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        boolean toConsole = false;
        boolean toHTML = false;
        boolean sort = false;
        // Remembered explicitly so that a wrong password can be told apart from
        // a missing one regardless of how many other options were given.
        boolean passwordSupplied = false;
        String password = "";
        String encoding = DEFAULT_ENCODING;
        String pdfFile = null;
        String textFile = null;
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                password = optionValue( args, i );
                passwordSupplied = true;
            }
            else if( args[i].equals( ENCODING ) )
            {
                i++;
                encoding = optionValue( args, i );
            }
            else if( args[i].equals( START_PAGE ) )
            {
                i++;
                startPage = Integer.parseInt( optionValue( args, i ) );
            }
            else if( args[i].equals( HTML ) )
            {
                toHTML = true;
            }
            else if( args[i].equals( SORT ) )
            {
                sort = true;
            }
            else if( args[i].equals( END_PAGE ) )
            {
                i++;
                endPage = Integer.parseInt( optionValue( args, i ) );
            }
            else if( args[i].equals( CONSOLE ) )
            {
                toConsole = true;
            }
            else
            {
                // First positional argument is the PDF, any later one the text file.
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
                else
                {
                    textFile = args[i];
                }
            }
        }

        if( pdfFile == null )
        {
            usage();
        }

        if( textFile == null )
        {
            textFile = defaultTextFile( pdfFile );
        }

        Writer output = null;
        PDDocument document = null;
        try
        {
            document = PDDocument.load( pdfFile );

            if( document.isEncrypted() )
            {
                try
                {
                    document.decrypt( password );
                }
                catch( InvalidPasswordException e )
                {
                    if( passwordSupplied )
                    {
                        //they supplied the wrong password
                        System.err.println( "Error: The supplied password is incorrect." );
                        System.exit( 2 );
                    }
                    else
                    {
                        //they didn't supply a password and the default of "" was wrong.
                        System.err.println( "Error: The document is encrypted." );
                        usage();
                    }
                }
            }
            if( toConsole )
            {
                output = new OutputStreamWriter( System.out );
            }
            else
            {
                if( encoding != null )
                {
                    output = new OutputStreamWriter(
                        new FileOutputStream( textFile ), encoding );
                }
                else
                {
                    //use default encoding
                    output = new OutputStreamWriter(
                        new FileOutputStream( textFile ) );
                }
            }

            PDFTextStripper stripper = null;
            if( toHTML )
            {
                stripper = new PDFText2HTML();
            }
            else
            {
                stripper = new PDFTextStripper();
            }
            stripper.setSortByPosition( sort );
            stripper.setStartPage( startPage );
            stripper.setEndPage( endPage );
            stripper.writeText( document, output );
        }
        finally
        {
            // Nested so that a failure while closing the writer cannot
            // prevent the document from being closed.
            try
            {
                if( output != null )
                {
                    output.close();
                }
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * This will return the value that follows an option on the command line,
     * printing the usage and exiting when the option is the last argument.
     *
     * @param args The command line arguments.
     * @param index The position at which the option value is expected.
     *
     * @return The argument at the given position.
     */
    private static String optionValue( String[] args, int index )
    {
        if( index >= args.length )
        {
            usage();
        }
        return args[index];
    }

    /**
     * This will derive the name of the text file from the name of the PDF file.
     * A trailing ".pdf" (in any case) is replaced by ".txt"; any other name
     * simply gets ".txt" appended, so the result is never null.
     *
     * @param pdfFile The name of the PDF file, must not be null.
     *
     * @return The name of the text file to write to.
     */
    private static String defaultTextFile( String pdfFile )
    {
        int extensionStart = pdfFile.length() - PDF_EXTENSION.length();
        if( extensionStart > 0 &&
            pdfFile.regionMatches( true, extensionStart, PDF_EXTENSION, 0, PDF_EXTENSION.length() ) )
        {
            return pdfFile.substring( 0, extensionStart ) + ".txt";
        }
        return pdfFile + ".txt";
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
            "  -password  <password>        Password to decrypt document\n" +
            "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
            "  -console                     Send text to console instead of file\n" +
            "  -html                        Output in HTML format instead of raw text\n" +
            "  -sort                        Sort the text before writing\n" +
            "  -startPage <number>          The first page to start extraction(1 based)\n" +
            "  -endPage <number>            The last page to extract(inclusive)\n" +
            "  <PDF file>                   The PDF document to use\n" +
            "  [Text File]                  The file to write the text to\n"
            );
        System.exit( 1 );
    }
}