aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/pdfparser/PDFParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/pdfparser/PDFParser.java')
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFParser.java557
1 files changed, 557 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFParser.java b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
new file mode 100644
index 0000000..d655ef1
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
@@ -0,0 +1,557 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.util.Iterator;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.exceptions.WrappedIOException;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.pdmodel.fdf.FDFDocument;
+
+import org.pdfbox.persistence.util.COSObjectKey;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class will handle the parsing of the PDF document.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.47 $
+ */
+public class PDFParser extends BaseParser
+{
+ private static Logger log = Logger.getLogger( PDFParser.class );
+ private static final int SPACE_BYTE = 32;
+
+ private static final String PDF_HEADER = "%PDF-";
+ private COSDocument document;
+
+ /**
+ * Temp file directory.
+ */
+ private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) );
+
+ private RandomAccessFile raf = null;
+
+ /**
+ * Constructor.
+ *
+ * @param input The input stream that contains the PDF document.
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFParser( InputStream input ) throws IOException
+ {
+ this(input, null);
+ }
+
+ /**
+ * Constructor to allow control over RandomAccessFile.
+ * @param input The input stream that contains the PDF document.
+ * @param rafi The RandomAccessFile to be used in internal COSDocument
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFParser(InputStream input, RandomAccessFile rafi)
+ throws IOException
+ {
+ super(input);
+ this.raf = rafi;
+ }
+
+ /**
+ * This is the directory where pdfbox will create a temporary file
+ * for storing pdf document stream in. By default this directory will
+ * be the value of the system property java.io.tmpdir.
+ *
+ * @param tmpDir The directory to create scratch files needed to store
+ * pdf document streams.
+ */
+ public void setTempDirectory( File tmpDir )
+ {
+ tempDirectory = tmpDir;
+ }
+
+ /**
+ * This will prase the stream and create the PDF document. This will close
+ * the stream when it is done parsing.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public void parse() throws IOException
+ {
+ try
+ {
+ if ( raf == null )
+ {
+ document = new COSDocument( tempDirectory );
+ }
+ else
+ {
+ document = new COSDocument( raf );
+ }
+ setDocument( document );
+ String header = readLine();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Header=" + header );
+ }
+ document.setHeaderString( header );
+
+ if( header.length() < PDF_HEADER.length()+1 )
+ {
+ throw new IOException( "Error: Header is corrupt '" + header + "'" );
+ }
+
+ //sometimes there are some garbage bytes in the header before the header
+ //actually starts, so lets try to find the header first.
+ int headerStart = header.indexOf( PDF_HEADER );
+
+ //greater than zero because if it is zero then
+ //there is no point of trimming
+ if( headerStart > 0 )
+ {
+ //trim off any leading characters
+ header = header.substring( headerStart, header.length() );
+ }
+
+ try
+ {
+ float pdfVersion = Float.parseFloat(
+ header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
+ document.setVersion( pdfVersion );
+ }
+ catch( NumberFormatException e )
+ {
+ throw new IOException( "Error getting pdf version:" + e );
+ }
+
+ skipHeaderFillBytes();
+
+
+ Object nextObject;
+ boolean wasLastParsedObjectAnXref = false;
+ try
+ {
+ while( (nextObject = parseObject()) != null )
+ {
+ if( nextObject instanceof PDFXref )
+ {
+ PDFXref xref = (PDFXref)nextObject;
+ addXref(xref);
+ wasLastParsedObjectAnXref = true;
+ }
+ else
+ {
+ wasLastParsedObjectAnXref = false;
+ }
+ skipSpaces();
+ }
+ if( document.getTrailer() == null )
+ {
+ COSDictionary trailer = new COSDictionary();
+ Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
+ while( xrefIter.hasNext() )
+ {
+ COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
+ trailer.addAll( next );
+ }
+ document.setTrailer( trailer );
+ }
+ if( !document.isEncrypted() )
+ {
+ document.dereferenceObjectStreams();
+ }
+ }
+ catch( IOException e )
+ {
+ if( wasLastParsedObjectAnXref )
+ {
+ log.debug( "Skipping some garbage", e );
+ //Then we assume that there is just random garbage after
+ //the xref, not sure why the PDF spec allows this but it does.
+ }
+ else
+ {
+ //some other error so just pass it along
+ throw e;
+ }
+ }
+ }
+ catch( Throwable t )
+ {
+ //so if the PDF is corrupt then close the document and clear
+ //all resources to it
+ if( document != null )
+ {
+ document.close();
+ }
+ if( t instanceof IOException )
+ {
+ throw (IOException)t;
+ }
+ else
+ {
+ throw new WrappedIOException( t );
+ }
+ }
+ finally
+ {
+ pdfSource.close();
+ }
+ }
+
+ /**
+ * This will skip a header's binary fill bytes. This is in accordance to
+ * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ protected void skipHeaderFillBytes() throws IOException
+ {
+ skipSpaces();
+ int c = pdfSource.peek();
+
+ if( !Character.isDigit( (char)c ) )
+ {
+ // Fill bytes conform with PDF reference (but without comment sign)
+ // => skip until EOL
+ readLine();
+ }
+ // else: no fill bytes
+ }
+
+ /**
+ * This will get the document that was parsed. parse() must be called before this is called.
+ * When you are done with this document you must call close() on it to release
+ * resources.
+ *
+ * @return The document that was parsed.
+ *
+ * @throws IOException If there is an error getting the document.
+ */
+ public COSDocument getDocument() throws IOException
+ {
+ if( document == null )
+ {
+ throw new IOException( "You must call parse() before calling getDocument()" );
+ }
+ return document;
+ }
+
+ /**
+ * This will get the PD document that was parsed. When you are done with
+ * this document you must call close() on it to release resources.
+ *
+ * @return The document at the PD layer.
+ *
+ * @throws IOException If there is an error getting the document.
+ */
+ public PDDocument getPDDocument() throws IOException
+ {
+ return new PDDocument( getDocument() );
+ }
+
+ /**
+ * This will get the FDF document that was parsed. When you are done with
+ * this document you must call close() on it to release resources.
+ *
+ * @return The document at the PD layer.
+ *
+ * @throws IOException If there is an error getting the document.
+ */
+ public FDFDocument getFDFDocument() throws IOException
+ {
+ return new FDFDocument( getDocument() );
+ }
+
+ /**
+ * This will parse a document object from the stream.
+ *
+ * @return The parsed object.
+ *
+ * @throws IOException If an IO error occurs.
+ */
+ private Object parseObject() throws IOException
+ {
+ Object object = null;
+ char peekedChar = (char)pdfSource.peek();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" );
+ }
+ if( pdfSource.isEOF() )
+ {
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Skipping because of EOF" );
+ //end of file we will return a null object and call it a day.
+ }
+ }
+ else if( peekedChar == 'x' ||
+ peekedChar == 't' ||
+ peekedChar == 's')
+ {
+ //System.out.println( "parseObject() parsing xref" );
+
+ //FDF documents do not always have the xref
+ if( peekedChar == 'x' || peekedChar == 't' )
+ {
+ object = parseXrefSection();
+ }
+
+ //if peeked char is xref or startxref
+ if( peekedChar == 'x' || peekedChar == 's')
+ {
+ skipSpaces();
+ while( pdfSource.peek() == 'x' )
+ {
+ parseXrefSection();
+ }
+ String startxref = readString();
+ if( !startxref.equals( "startxref" ) )
+ {
+ throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
+ }
+ skipSpaces();
+ //read some integer that is in the stream but PDFBox doesn't use
+ readInt();
+ }
+
+ //This MUST be readLine because readString strips out comments
+ //and it will think that %% is a comment in from of the EOF
+ String eof = readExpectedString( "%%EOF" );
+ if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
+ {
+ throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
+ " next=" +readString() );
+ }
+ else if( !pdfSource.isEOF() )
+ {
+ //we might really be at the end of the file, there might just be some crap at the
+ //end of the file.
+ if( pdfSource.available() < 1000 )
+ {
+ //We need to determine if we are at the end of the file.
+ byte[] data = new byte[ 1000 ];
+
+ int amountRead = pdfSource.read( data );
+ if( amountRead != -1 )
+ {
+ pdfSource.unread( data, 0, amountRead );
+ }
+ boolean atEndOfFile = true;//we assume yes unless we find another.
+ for( int i=0; i<amountRead-3 && atEndOfFile; i++ )
+ {
+ atEndOfFile = !(data[i] == 'E' &&
+ data[i+1] == 'O' &&
+ data[i+2] == 'F' );
+ }
+ if( atEndOfFile )
+ {
+ while( pdfSource.read( data, 0, data.length ) != -1 )
+ {
+ //read until done.
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ int number;
+ int genNum;
+ String objectKey = null;
+ try
+ {
+ number = readInt();
+ }
+ catch( IOException e )
+ {
+ //ok for some reason "GNU Ghostscript 5.10" puts two endobj
+ //statements after an object, of course this is nonsense
+ //but because we want to support as many PDFs as possible
+ //we will simply try again
+ number = readInt();
+ }
+ skipSpaces();
+ genNum = readInt();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Parsing object (" + number + "," + genNum + ")" );
+ }
+
+ objectKey = readString( 3 );
+ //System.out.println( "parseObject() num=" + number + " genNumber=" + genNum + " key='" + objectKey + "'" );
+ if( !objectKey.equals( "obj" ) )
+ {
+ throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
+ }
+
+ skipSpaces();
+ COSBase pb = parseDirObject();
+ String endObjectKey = readString();
+ if( endObjectKey.equals( "stream" ) )
+ {
+ pdfSource.unread( endObjectKey.getBytes() );
+ pdfSource.unread( ' ' );
+ if( pb instanceof COSDictionary )
+ {
+ pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
+ }
+ else
+ {
+ // this is not legal
+ // the combination of a dict and the stream/endstream forms a complete stream object
+ throw new IOException("stream not preceded by dictionary");
+ }
+ endObjectKey = readString();
+ }
+ COSObjectKey key = new COSObjectKey( number, genNum );
+ COSObject pdfObject = document.getObjectFromPool( key );
+ object = pdfObject;
+ pdfObject.setObject(pb);
+
+ if( !endObjectKey.equals( "endobj" ) )
+ {
+ if( !pdfSource.isEOF() )
+ {
+ try
+ {
+ //It is possible that the endobj is missing, there
+ //are several PDFs out there that do that so skip it and move on.
+ Float.parseFloat( endObjectKey );
+ pdfSource.unread( SPACE_BYTE );
+ pdfSource.unread( endObjectKey.getBytes() );
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Missing endobj, found '" + endObjectKey +
+ "' instead, assuming that endobj is not present and will continue parsing." );
+ }
+ }
+ catch( NumberFormatException e )
+ {
+ //we will try again incase there was some garbage which
+ //some writers will leave behind.
+ String secondEndObjectKey = readString();
+ if( !secondEndObjectKey.equals( "endobj" ) )
+ {
+ throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
+ "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
+ }
+ }
+ }
+ }
+ skipSpaces();
+
+ }
+ //System.out.println( "parsed=" + object );
+ return object;
+ }
+
+
+ /**
+ * This will parse the xref table and trailers from the stream.
+ *
+ * @return a new PDFXref
+ *
+ * @throws IOException If an IO error occurs.
+ */
+ protected PDFXref parseXrefSection() throws IOException
+ {
+ int[] params = new int[2];
+ parseXrefTable(params);
+ parseTrailer();
+
+ return new PDFXref(params[0], params[1]);
+ }
+
+ /**
+ * This will parse the xref table from the stream.
+ *
+ * It stores the starting object number and the count
+ *
+ * @param params The start and count parameters
+ *
+ * @throws IOException If an IO error occurs.
+ */
+ protected void parseXrefTable(int[] params) throws IOException
+ {
+ String nextLine = null;
+
+ nextLine = readLine();
+ if( nextLine.equals( "xref" ) )
+ {
+ params[0] = readInt();
+ params[1] = readInt();
+ nextLine = readString();
+ }
+ skipSpaces();
+ while( !nextLine.equals( "trailer" ) && !pdfSource.isEOF() && !isEndOfName((char)pdfSource.peek()))
+ {
+ //skip past all the xref entries.
+ nextLine = readString();
+ skipSpaces();
+ }
+ skipSpaces();
+ }
+
+ private void parseTrailer() throws IOException
+ {
+ COSDictionary parsedTrailer = parseCOSDictionary();
+ COSDictionary docTrailer = document.getTrailer();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "parsedTrailer=" + parsedTrailer );
+ log.debug( "docTrailer=" + docTrailer );
+ }
+ if( docTrailer == null )
+ {
+ document.setTrailer( parsedTrailer );
+ }
+ else
+ {
+ docTrailer.addAll( parsedTrailer );
+ }
+ }
+}