From c68ad0ec056b37c82debebcecfcde1866d61b4d9 Mon Sep 17 00:00:00 2001 From: tknall Date: Tue, 25 Nov 2008 12:03:13 +0000 Subject: Removing pdfbox from source. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@301 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- src/main/java/org/pdfbox/pdfparser/PDFParser.java | 557 ---------------------- 1 file changed, 557 deletions(-) delete mode 100644 src/main/java/org/pdfbox/pdfparser/PDFParser.java (limited to 'src/main/java/org/pdfbox/pdfparser/PDFParser.java') diff --git a/src/main/java/org/pdfbox/pdfparser/PDFParser.java b/src/main/java/org/pdfbox/pdfparser/PDFParser.java deleted file mode 100644 index d655ef1..0000000 --- a/src/main/java/org/pdfbox/pdfparser/PDFParser.java +++ /dev/null @@ -1,557 +0,0 @@ -/** - * Copyright (c) 2003-2005, www.pdfbox.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. Neither the name of pdfbox; nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * http://www.pdfbox.org - * - */ -package org.pdfbox.pdfparser; - -import java.io.File; -import java.io.RandomAccessFile; -import java.io.InputStream; -import java.io.IOException; - -import java.util.Iterator; - -import org.pdfbox.cos.COSBase; -import org.pdfbox.cos.COSDictionary; -import org.pdfbox.cos.COSDocument; -import org.pdfbox.cos.COSObject; -import org.pdfbox.cos.COSStream; -import org.pdfbox.exceptions.WrappedIOException; - -import org.pdfbox.pdmodel.PDDocument; - -import org.pdfbox.pdmodel.fdf.FDFDocument; - -import org.pdfbox.persistence.util.COSObjectKey; - -import org.apache.log4j.Logger; - -/** - * This class will handle the parsing of the PDF document. - * - * @author Ben Litchfield (ben@benlitchfield.com) - * @version $Revision: 1.47 $ - */ -public class PDFParser extends BaseParser -{ - private static Logger log = Logger.getLogger( PDFParser.class ); - private static final int SPACE_BYTE = 32; - - private static final String PDF_HEADER = "%PDF-"; - private COSDocument document; - - /** - * Temp file directory. - */ - private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) ); - - private RandomAccessFile raf = null; - - /** - * Constructor. - * - * @param input The input stream that contains the PDF document. - * - * @throws IOException If there is an error initializing the stream. - */ - public PDFParser( InputStream input ) throws IOException - { - this(input, null); - } - - /** - * Constructor to allow control over RandomAccessFile. - * @param input The input stream that contains the PDF document. - * @param rafi The RandomAccessFile to be used in internal COSDocument - * - * @throws IOException If there is an error initializing the stream. - */ - public PDFParser(InputStream input, RandomAccessFile rafi) - throws IOException - { - super(input); - this.raf = rafi; - } - - /** - * This is the directory where pdfbox will create a temporary file - * for storing pdf document stream in. By default this directory will - * be the value of the system property java.io.tmpdir. - * - * @param tmpDir The directory to create scratch files needed to store - * pdf document streams. - */ - public void setTempDirectory( File tmpDir ) - { - tempDirectory = tmpDir; - } - - /** - * This will prase the stream and create the PDF document. This will close - * the stream when it is done parsing. - * - * @throws IOException If there is an error reading from the stream. - */ - public void parse() throws IOException - { - try - { - if ( raf == null ) - { - document = new COSDocument( tempDirectory ); - } - else - { - document = new COSDocument( raf ); - } - setDocument( document ); - String header = readLine(); - if( log.isDebugEnabled() ) - { - log.debug( "Header=" + header ); - } - document.setHeaderString( header ); - - if( header.length() < PDF_HEADER.length()+1 ) - { - throw new IOException( "Error: Header is corrupt '" + header + "'" ); - } - - //sometimes there are some garbage bytes in the header before the header - //actually starts, so lets try to find the header first. - int headerStart = header.indexOf( PDF_HEADER ); - - //greater than zero because if it is zero then - //there is no point of trimming - if( headerStart > 0 ) - { - //trim off any leading characters - header = header.substring( headerStart, header.length() ); - } - - try - { - float pdfVersion = Float.parseFloat( - header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) ); - document.setVersion( pdfVersion ); - } - catch( NumberFormatException e ) - { - throw new IOException( "Error getting pdf version:" + e ); - } - - skipHeaderFillBytes(); - - - Object nextObject; - boolean wasLastParsedObjectAnXref = false; - try - { - while( (nextObject = parseObject()) != null ) - { - if( nextObject instanceof PDFXref ) - { - PDFXref xref = (PDFXref)nextObject; - addXref(xref); - wasLastParsedObjectAnXref = true; - } - else - { - wasLastParsedObjectAnXref = false; - } - skipSpaces(); - } - if( document.getTrailer() == null ) - { - COSDictionary trailer = new COSDictionary(); - Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator(); - while( xrefIter.hasNext() ) - { - COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject(); - trailer.addAll( next ); - } - document.setTrailer( trailer ); - } - if( !document.isEncrypted() ) - { - document.dereferenceObjectStreams(); - } - } - catch( IOException e ) - { - if( wasLastParsedObjectAnXref ) - { - log.debug( "Skipping some garbage", e ); - //Then we assume that there is just random garbage after - //the xref, not sure why the PDF spec allows this but it does. - } - else - { - //some other error so just pass it along - throw e; - } - } - } - catch( Throwable t ) - { - //so if the PDF is corrupt then close the document and clear - //all resources to it - if( document != null ) - { - document.close(); - } - if( t instanceof IOException ) - { - throw (IOException)t; - } - else - { - throw new WrappedIOException( t ); - } - } - finally - { - pdfSource.close(); - } - } - - /** - * This will skip a header's binary fill bytes. This is in accordance to - * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header" - * - * @throws IOException If there is an error reading from the stream. - */ - protected void skipHeaderFillBytes() throws IOException - { - skipSpaces(); - int c = pdfSource.peek(); - - if( !Character.isDigit( (char)c ) ) - { - // Fill bytes conform with PDF reference (but without comment sign) - // => skip until EOL - readLine(); - } - // else: no fill bytes - } - - /** - * This will get the document that was parsed. parse() must be called before this is called. - * When you are done with this document you must call close() on it to release - * resources. - * - * @return The document that was parsed. - * - * @throws IOException If there is an error getting the document. - */ - public COSDocument getDocument() throws IOException - { - if( document == null ) - { - throw new IOException( "You must call parse() before calling getDocument()" ); - } - return document; - } - - /** - * This will get the PD document that was parsed. When you are done with - * this document you must call close() on it to release resources. - * - * @return The document at the PD layer. - * - * @throws IOException If there is an error getting the document. - */ - public PDDocument getPDDocument() throws IOException - { - return new PDDocument( getDocument() ); - } - - /** - * This will get the FDF document that was parsed. When you are done with - * this document you must call close() on it to release resources. - * - * @return The document at the PD layer. - * - * @throws IOException If there is an error getting the document. - */ - public FDFDocument getFDFDocument() throws IOException - { - return new FDFDocument( getDocument() ); - } - - /** - * This will parse a document object from the stream. - * - * @return The parsed object. - * - * @throws IOException If an IO error occurs. - */ - private Object parseObject() throws IOException - { - Object object = null; - char peekedChar = (char)pdfSource.peek(); - if( log.isDebugEnabled() ) - { - log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" ); - } - if( pdfSource.isEOF() ) - { - if( log.isDebugEnabled() ) - { - log.debug( "Skipping because of EOF" ); - //end of file we will return a null object and call it a day. - } - } - else if( peekedChar == 'x' || - peekedChar == 't' || - peekedChar == 's') - { - //System.out.println( "parseObject() parsing xref" ); - - //FDF documents do not always have the xref - if( peekedChar == 'x' || peekedChar == 't' ) - { - object = parseXrefSection(); - } - - //if peeked char is xref or startxref - if( peekedChar == 'x' || peekedChar == 's') - { - skipSpaces(); - while( pdfSource.peek() == 'x' ) - { - parseXrefSection(); - } - String startxref = readString(); - if( !startxref.equals( "startxref" ) ) - { - throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource ); - } - skipSpaces(); - //read some integer that is in the stream but PDFBox doesn't use - readInt(); - } - - //This MUST be readLine because readString strips out comments - //and it will think that %% is a comment in from of the EOF - String eof = readExpectedString( "%%EOF" ); - if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() ) - { - throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() + - " next=" +readString() ); - } - else if( !pdfSource.isEOF() ) - { - //we might really be at the end of the file, there might just be some crap at the - //end of the file. - if( pdfSource.available() < 1000 ) - { - //We need to determine if we are at the end of the file. - byte[] data = new byte[ 1000 ]; - - int amountRead = pdfSource.read( data ); - if( amountRead != -1 ) - { - pdfSource.unread( data, 0, amountRead ); - } - boolean atEndOfFile = true;//we assume yes unless we find another. - for( int i=0; i