1 files changed, 557 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFParser.java b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
new file mode 100644
index 0000000..d655ef1
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
@@ -0,0 +1,557 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.util.Iterator;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.exceptions.WrappedIOException;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.pdmodel.fdf.FDFDocument;
+
+import org.pdfbox.persistence.util.COSObjectKey;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class will handle the parsing of the PDF document.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.47 $
+ */
+public class PDFParser extends BaseParser
+{
+    private static Logger log = Logger.getLogger( PDFParser.class );
+    // ASCII code of a space; parseObject() pushes it back into the stream as a separator.
+    private static final int SPACE_BYTE = 32;
+    // Marker searched for in the header line; the version number is read right after it.
+    private static final String PDF_HEADER = "%PDF-";
+    // The document built by parse(); getDocument() rejects calls while this is still null.
+    private COSDocument document;
+
+    // Directory passed to the COSDocument when no RandomAccessFile was supplied.
+    private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) );
+    // Caller-supplied file passed to the COSDocument; when null, tempDirectory is used instead.
+    private RandomAccessFile raf = null;
+
+    /**
+     * Creates a parser that keeps document streams in a temporary file;
+     * equivalent to the two-argument constructor with no RandomAccessFile.
+     *
+     * @param input The stream to read the PDF document from.
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFParser( InputStream input ) throws IOException
+    {
+        this( input, (RandomAccessFile)null );
+    }
+
+    /**
+     * Creates a parser whose internal COSDocument uses the given file.
+     *
+     * @param input The stream to read the PDF document from.
+     * @param rafi The RandomAccessFile handed to the internal COSDocument;
+     *        when null, the temp directory is used instead.
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFParser( InputStream input, RandomAccessFile rafi ) throws IOException
+    {
+        super( input );
+        raf = rafi;
+    }
+
+    /**
+     * Sets the directory in which pdfbox creates the temporary file that
+     * holds pdf document streams.  When this is never called, the value of
+     * the system property java.io.tmpdir is used.
+     *
+     * @param tmpDir The directory for the scratch files that store
+     *        pdf document streams.
+     */
+    public void setTempDirectory( File tmpDir )
+    {
+        this.tempDirectory = tmpDir;
+    }
+
+    /**
+     * This will parse the stream and create the PDF document.  This will close
+     * the stream when it is done parsing.
+     *
+     * An IOException raised right after an xref section is treated as trailing
+     * garbage and ignored; trailer and object streams are still processed.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public void parse() throws IOException
+    {
+        try
+        {
+            if ( raf == null )
+            {
+                document = new COSDocument( tempDirectory );
+            }
+            else
+            {
+                document = new COSDocument( raf );
+            }
+            setDocument( document );
+            String header = readLine();
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Header=" + header );
+            }
+            document.setHeaderString( header );
+
+            if( header.length() < PDF_HEADER.length()+1 )
+            {
+                throw new IOException( "Error: Header is corrupt '" + header + "'" );
+            }
+
+            //garbage bytes sometimes precede the header, so look for where it really
+            //starts; nothing is trimmed when the marker is at index zero or is absent.
+            int headerStart = header.indexOf( PDF_HEADER );
+            if( headerStart > 0 )
+            {
+                header = header.substring( headerStart, header.length() );
+            }
+
+            try
+            {
+                float pdfVersion = Float.parseFloat(
+                    header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
+                document.setVersion( pdfVersion );
+            }
+            catch( NumberFormatException e )
+            {
+                //keep the original exception as the cause so its stack trace is not lost
+                IOException versionError = new IOException( "Error getting pdf version:" + e );
+                versionError.initCause( e );
+                throw versionError;
+            }
+
+            skipHeaderFillBytes();
+
+            Object nextObject;
+            boolean wasLastParsedObjectAnXref = false;
+            try
+            {
+                while( (nextObject = parseObject()) != null )
+                {
+                    if( nextObject instanceof PDFXref )
+                    {
+                        addXref( (PDFXref)nextObject );
+                        wasLastParsedObjectAnXref = true;
+                    }
+                    else
+                    {
+                        wasLastParsedObjectAnXref = false;
+                    }
+                    skipSpaces();
+                }
+            }
+            catch( IOException e )
+            {
+                if( wasLastParsedObjectAnXref )
+                {
+                    //assume that there is just random garbage after the xref
+                    log.debug( "Skipping some garbage", e );
+                }
+                else
+                {
+                    throw e; //some other error so just pass it along
+                }
+            }
+
+            //The trailer and object streams are handled outside of the try block
+            //above so that they are still processed after garbage was skipped.
+            if( document.getTrailer() == null )
+            {
+                COSDictionary trailer = new COSDictionary();
+                Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
+                while( xrefIter.hasNext() )
+                {
+                    COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
+                    trailer.addAll( next );
+                }
+                document.setTrailer( trailer );
+            }
+            if( !document.isEncrypted() )
+            {
+                document.dereferenceObjectStreams();
+            }
+        }
+        catch( Throwable t )
+        {
+            //the PDF is corrupt, so close the document to release its resources
+            if( document != null )
+            {
+                document.close();
+            }
+            if( t instanceof IOException )
+            {
+                throw (IOException)t;
+            }
+            else
+            {
+                throw new WrappedIOException( t );
+            }
+        }
+        finally
+        {
+            pdfSource.close();
+        }
+    }
+
+    /**
+     * Skips the binary fill bytes that may follow the header line, as described in
+     * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header".
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected void skipHeaderFillBytes() throws IOException
+    {
+        skipSpaces();
+        char upcoming = (char)pdfSource.peek();
+        if( Character.isDigit( upcoming ) )
+        {
+            // a digit means that there are no fill bytes to skip
+            return;
+        }
+        // Fill bytes conform with PDF reference (but without comment sign)
+        // => skip until EOL
+        readLine();
+    }
+
+    /**
+     * Returns the COS document built by parse(), which must have been called
+     * beforehand.  When you are done with the returned document you must call
+     * close() on it to release resources.
+     *
+     * @return The document that was parsed.
+     *
+     * @throws IOException If no document exists because parse() was not called.
+     */
+    public COSDocument getDocument() throws IOException
+    {
+        if( document != null )
+        {
+            return document;
+        }
+        throw new IOException( "You must call parse() before calling getDocument()" );
+    }
+
+    /**
+     * Wraps the parsed document in a new PDDocument.  When you are done with
+     * the returned document you must call close() on it to release resources.
+     *
+     * @return The document at the PD layer.
+     * @throws IOException If there is an error getting the document.
+     */
+    public PDDocument getPDDocument() throws IOException
+    {
+        COSDocument parsed = getDocument();
+        return new PDDocument( parsed );
+    }
+
+    /**
+     * Wraps the parsed document in a new FDFDocument.  When you are done with
+     * the returned document you must call close() on it to release resources.
+     *
+     * @return The document at the FDF layer.
+     * @throws IOException If there is an error getting the document.
+     */
+    public FDFDocument getFDFDocument() throws IOException
+    {
+        COSDocument parsed = getDocument();
+        return new FDFDocument( parsed );
+    }
+
+    /**
+     * This will parse the next indirect object, or an xref/trailer section up
+     * to its %%EOF marker, from the stream.
+     *
+     * @return The COSObject or PDFXref that was read, or null if there was none.
+     *
+     * @throws IOException If an IO error occurs.
+     */
+    private Object parseObject() throws IOException
+    {
+        Object object = null;
+        char peekedChar = (char)pdfSource.peek();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" );
+        }
+        if( pdfSource.isEOF() )
+        {
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Skipping because of EOF" );
+                //end of file we will return a null object and call it a day.
+            }
+        }
+        else if( peekedChar == 'x' ||
+                 peekedChar == 't' ||
+                 peekedChar == 's')
+        {
+            //the peeked char is taken to start xref, trailer or startxref;
+            //FDF documents do not always have the xref
+            if( peekedChar == 'x' || peekedChar == 't' )
+            {
+                object = parseXrefSection();
+            }
+
+            //if peeked char is xref or startxref
+            if( peekedChar == 'x' || peekedChar == 's')
+            {
+                skipSpaces();
+                while( pdfSource.peek() == 'x' )
+                {
+                    parseXrefSection();
+                }
+                String startxref = readString();
+                if( !startxref.equals( "startxref" ) )
+                {
+                    throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
+                }
+                skipSpaces();
+                //read some integer that is in the stream but PDFBox doesn't use
+                readInt();
+            }
+
+            //readString can not be used here because it strips out comments
+            //and it would think that %% is a comment in front of the EOF
+            String eof = readExpectedString( "%%EOF" );
+            if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
+            {
+                throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
+                                       " next=" +readString() );
+            }
+            else if( !pdfSource.isEOF() )
+            {
+                //we might really be at the end of the file, there might just be some crap at the
+                //end of the file.
+                if( pdfSource.available() < 1000 )
+                {
+                    //We need to determine if we are at the end of the file.
+                    byte[] data = new byte[ 1000 ];
+
+                    int amountRead = pdfSource.read( data );
+                    if( amountRead != -1 )
+                    {
+                        pdfSource.unread( data, 0, amountRead );
+                    }
+                    boolean atEndOfFile = true;//we assume yes unless we find another.
+                    //amountRead-3 is the last index at which a three byte marker still fits
+                    for( int i=0; i<=amountRead-3 && atEndOfFile; i++ )
+                    {
+                        atEndOfFile = !(data[i] == 'E' &&
+                                        data[i+1] == 'O' &&
+                                        data[i+2] == 'F' );
+                    }
+                    if( atEndOfFile )
+                    {
+                        while( pdfSource.read( data, 0, data.length ) != -1 )
+                        {
+                            //read until done.
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            int number;
+            int genNum;
+            String objectKey = null;
+            try
+            {
+                number = readInt();
+            }
+            catch( IOException e )
+            {
+                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
+                //statements after an object, of course this is nonsense
+                //but because we want to support as many PDFs as possible
+                //we will simply try again
+                number = readInt();
+            }
+            skipSpaces();
+            genNum = readInt();
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Parsing object (" + number + "," + genNum + ")" );
+            }
+
+            objectKey = readString( 3 );
+            if( !objectKey.equals( "obj" ) )
+            {
+                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
+            }
+
+            skipSpaces();
+            COSBase pb = parseDirObject();
+            String endObjectKey = readString();
+            if( endObjectKey.equals( "stream" ) )
+            {
+                //push the keyword and a space back before calling parseCOSStream
+                pdfSource.unread( endObjectKey.getBytes() );
+                pdfSource.unread( ' ' );
+                if( pb instanceof COSDictionary )
+                {
+                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
+                }
+                else
+                {
+                    // this is not legal
+                    // the combination of a dict and the stream/endstream forms a complete stream object
+                    throw new IOException("stream not preceded by dictionary");
+                }
+                endObjectKey = readString();
+            }
+            COSObjectKey key = new COSObjectKey( number, genNum );
+            COSObject pdfObject = document.getObjectFromPool( key );
+            object = pdfObject;
+            pdfObject.setObject(pb);
+
+            if( !endObjectKey.equals( "endobj" ) )
+            {
+                if( !pdfSource.isEOF() )
+                {
+                    try
+                    {
+                        //It is possible that the endobj  is missing, there
+                        //are several PDFs out there that do that so skip it and move on.
+                        Float.parseFloat( endObjectKey );
+                        pdfSource.unread( SPACE_BYTE );
+                        pdfSource.unread( endObjectKey.getBytes() );
+                        if( log.isDebugEnabled() )
+                        {
+                            log.debug( "Missing endobj, found '" + endObjectKey +
+                                "' instead, assuming that endobj is not present and will continue parsing." );
+                        }
+                    }
+                    catch( NumberFormatException e )
+                    {
+                        //we will try again incase there was some garbage which
+                        //some writers will leave behind.
+                        String secondEndObjectKey = readString();
+                        if( !secondEndObjectKey.equals( "endobj" ) )
+                        {
+                            throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
+                                "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
+                        }
+                    }
+                }
+            }
+            skipSpaces();
+
+        }
+        return object;
+    }
+
+
+    /**
+     * Reads one xref table together with the trailer that follows it.
+     *
+     * @return a new PDFXref holding the start and count read from the table
+     * @throws IOException If an IO error occurs.
+     */
+    protected PDFXref parseXrefSection() throws IOException
+    {
+        int[] startAndCount = new int[2];
+        parseXrefTable( startAndCount );
+        parseTrailer();
+        int start = startAndCount[0];
+        int count = startAndCount[1];
+        return new PDFXref( start, count );
+    }
+
+    /**
+     * Skips over an xref table in the stream, keeping only the starting
+     * object number and the count that follow the xref keyword.
+     *
+     * @param params Receives the start at index 0 and the count at index 1;
+     *        left untouched when the first line is not xref.
+     *
+     * @throws IOException If an IO error occurs.
+     */
+    protected void parseXrefTable(int[] params) throws IOException
+    {
+        String token = readLine();
+        if( token.equals( "xref" ) )
+        {
+            params[0] = readInt();
+            params[1] = readInt();
+            token = readString();
+        }
+        skipSpaces();
+
+        //skip past all the xref entries: stop at the trailer keyword, at the
+        //end of the file or when isEndOfName accepts the next character.
+        while( !( token.equals( "trailer" ) || pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()) ) )
+        {
+            token = readString();
+            skipSpaces();
+        }
+        skipSpaces();
+    }
+
+    /** Reads a trailer dictionary and sets it as, or adds it to, the document trailer. */
+    private void parseTrailer() throws IOException
+    {
+        COSDictionary fresh = parseCOSDictionary();
+        COSDictionary existing = document.getTrailer();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parsedTrailer=" + fresh );
+            log.debug( "docTrailer=" + existing );
+        }
+        if( existing != null )
+        {
+            //the document already has a trailer, so add the new entries to it
+            existing.addAll( fresh );
+            return;
+        }
+        document.setTrailer( fresh );
+    }
+}