From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001
From: tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>
Date: Fri, 1 Dec 2006 12:20:24 +0000
Subject: Initial import of release 2.2.

git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
---
 src/main/java/org/pdfbox/pdfparser/BaseParser.java | 1369 ++++++++++++++++++++
 .../pdfbox/pdfparser/PDFObjectStreamParser.java    |  137 ++
 src/main/java/org/pdfbox/pdfparser/PDFParser.java  |  557 ++++++++
 .../java/org/pdfbox/pdfparser/PDFStreamParser.java |  403 ++++++
 src/main/java/org/pdfbox/pdfparser/PDFXref.java    |   96 ++
 src/main/java/org/pdfbox/pdfparser/package.html    |    9 +
 6 files changed, 2571 insertions(+)
 create mode 100644 src/main/java/org/pdfbox/pdfparser/BaseParser.java
 create mode 100644 src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java
 create mode 100644 src/main/java/org/pdfbox/pdfparser/PDFParser.java
 create mode 100644 src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
 create mode 100644 src/main/java/org/pdfbox/pdfparser/PDFXref.java
 create mode 100644 src/main/java/org/pdfbox/pdfparser/package.html

(limited to 'src/main/java/org/pdfbox/pdfparser')

diff --git a/src/main/java/org/pdfbox/pdfparser/BaseParser.java b/src/main/java/org/pdfbox/pdfparser/BaseParser.java
new file mode 100644
index 0000000..3937025
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/BaseParser.java
@@ -0,0 +1,1369 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.BufferedInputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.pdfbox.io.ByteArrayPushBackInputStream;
+import org.pdfbox.io.PushBackInputStream;
+
+import org.pdfbox.cos.COSArray;
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSBoolean;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSInteger;
+import org.pdfbox.cos.COSName;
+import org.pdfbox.cos.COSNull;
+import org.pdfbox.cos.COSNumber;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.cos.COSString;
+
+import org.pdfbox.persistence.util.COSObjectKey;
+import org.apache.log4j.Logger;
+
+/**
+ * This class is used to contain parsing logic that will be used by both the
+ * PDFParser and the COSStreamParser.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.57 $
+ */
+public abstract class BaseParser
+{
+    private static Logger log = Logger.getLogger(BaseParser.class);
+
+    /**
+     * The ASCII bytes of the keyword "endstream"; used to find the end of the stream data.
+     */
+    public static final byte[] ENDSTREAM = 
+        new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" );
+
+    /**
+     * The keyword "def"; it is skipped when it follows a dictionary value (cmap streams).
+     */
+    public static final String DEF = "def";
+
+    /**
+     * This is the stream that will be read from.
+     */
+    //protected PushBackByteArrayStream pdfSource;
+    protected PushBackInputStream pdfSource;
+
+    /**
+     * The xref list was moved here as it is a persistence construct.  It may not be
+     * needed at all when the file is not read from behind with delayed access to objects.
+     */
+    private List xrefs = new ArrayList();
+
+    /** The document (set via setDocument) whose object pool is used to look up referenced objects. */
+    private COSDocument document;
+
+    /**
+     * Creates a parser that reads its data from a stream.
+     *
+     * @param input The input stream to read the data from.
+     *
+     * @throws IOException If there is an error reading the input stream.
+     */
+    public BaseParser( InputStream input) throws IOException
+    {
+        BufferedInputStream buffered = new BufferedInputStream( input, 16384 );
+        this.pdfSource = new PushBackInputStream( buffered, 4096 );
+    }
+    
+    /**
+     * Creates a parser that reads its data from a byte array.
+     *
+     * @param input The array to read the data from.
+     *
+     * @throws IOException If there is an error reading the byte data.
+     */
+    protected BaseParser(byte[] input) throws IOException
+    {
+        this.pdfSource = new ByteArrayPushBackInputStream(input);
+    }
+    
+    /**
+     * Sets the document whose object pool is used to look up referenced objects.
+     *
+     * @param doc The current document.
+     */
+    public void setDocument( COSDocument doc )
+    {
+        this.document = doc;
+    }
+
+    private static boolean isHexDigit(char ch)
+    {
+        // plain range checks are used on purpose: a lookup such as HEXDIGITS.indexOf(ch) can lead
+        // to problems with certain versions of the IBM JIT compiler (and is slower anyway)
+        boolean decimal = ch >= '0' && ch <= '9';
+        boolean lower = ch >= 'a' && ch <= 'f';
+        boolean upper = ch >= 'A' && ch <= 'F';
+        return decimal || lower || upper;
+    }
+
+    /**
+     * This will parse a PDF dictionary value: a direct object or an "objNum genNum R" reference.
+     *
+     * @return The direct object, or the object obtained from the document pool for a reference.
+     *
+     * @throws IOException If the value is malformed or there is an error reading the stream.
+     */
+    private COSBase parseCOSDictionaryValue() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSDictionaryValue() " + pdfSource );
+        }
+        COSBase number = parseDirObject();
+        skipSpaces();
+        char next = (char)pdfSource.peek();
+        if( next < '0' || next > '9' )
+        {
+            //no second number follows, so the object parsed above is the value itself
+            return number;
+        }
+        COSBase generationNumber = parseDirObject();
+        skipSpaces();
+        char r = (char)pdfSource.read();
+        if( r != 'R' )
+        {
+            throw new IOException( "expected='R' actual='" + r + "' " + pdfSource );
+        }
+        //report a malformed reference as a parse error instead of failing in the casts below
+        if( !(number instanceof COSInteger) || !(generationNumber instanceof COSInteger) )
+        {
+            throw new IOException( "Error: expected integer object and generation number before 'R' " + pdfSource );
+        }
+        COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
+                                            ((COSInteger) generationNumber).intValue());
+        return document.getObjectFromPool(key);
+    }
+
+    /**
+     * This will parse a PDF dictionary, i.e. a sequence of name/value pairs
+     * that is enclosed in "<<" and ">>".
+     *
+     * @return The parsed dictionary.
+     *
+     * @throws IOException IF there is an error reading the stream.
+     */
+    protected COSDictionary parseCOSDictionary() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSDictionary() " + pdfSource );
+        }
+
+        //the dictionary has to start with "<<"
+        char opening = (char)pdfSource.read();
+        if( opening != '<')
+        {
+            throw new IOException( "expected='<' actual='" + opening + "'" );
+        }
+        opening = (char)pdfSource.read();
+        if( opening != '<')
+        {
+            throw new IOException( "expected='<' actual='" + opening + "' " + pdfSource );
+        }
+        skipSpaces();
+
+        COSDictionary dictionary = new COSDictionary();
+        skipSpaces();
+        //read name/value pairs until the first '>' of the closing ">>" shows up
+        while( (char)pdfSource.peek() != '>' )
+        {
+            COSName key = parseCOSName();
+            COSBase value = parseCOSDictionaryValue();
+            skipSpaces();
+
+            if( (char)pdfSource.peek() == 'd' )
+            {
+                //if the next string is 'def' then we are parsing a cmap stream
+                //and want to ignore it, otherwise the string that was read
+                //is pushed back onto the source.
+                String potentialDEF = readString();
+                if( potentialDEF.equals( DEF ) )
+                {
+                    skipSpaces();
+                }
+                else
+                {
+                    pdfSource.unread( potentialDEF.getBytes() );
+                }
+            }
+
+            if( value == null )
+            {
+                throw new IOException("Bad Dictionary Declaration " + pdfSource );
+            }
+            dictionary.setItem( key, value );
+            skipSpaces();
+        }
+
+        //the dictionary has to end with ">>"
+        for( int i=0; i<2; i++ )
+        {
+            char closing = (char)pdfSource.read();
+            if( closing != '>' )
+            {
+                throw new IOException( "expected='>' actual='" + closing + "'" );
+            }
+        }
+
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSDictionary() done peek='" + pdfSource.peek() + "'" );
+        }
+        return dictionary;
+    }
+
+    /**
+     * This will read a COSStream from the input stream: the keyword "stream", the
+     * data (written to the filtered stream of the COSStream) and the keyword "endstream".
+     * @param dic The dictionary that goes with this stream.
+     * @param file The file to write the stream to when reading.
+     *
+     * @return The parsed pdf stream.
+     *
+     * @throws IOException If there is an error reading the stream.
+     */
+    protected COSStream parseCOSStream( COSDictionary dic, RandomAccessFile file ) throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSStream() " + pdfSource );
+        }
+        COSStream stream = new COSStream( dic, file );
+        OutputStream out = null;
+        try
+        {
+            String streamString = readString();
+            //the data has to be introduced by the keyword "stream"
+
+            if (!streamString.equals("stream"))
+            {
+                throw new IOException("expected='stream' actual='" + streamString + "'");
+            }
+
+            //PDF Ref 3.2.7 A stream must be followed by either
+            //a CRLF or LF but nothing else.
+
+            int whitespace = pdfSource.read();
+            
+            //see brother_scan_cover.pdf, it adds whitespaces
+            //after the stream but before the start of the
+            //data, so just read those first
+            while (whitespace == 0x20)
+            {
+                whitespace = pdfSource.read();
+            }
+
+            if( whitespace == 0x0D )
+            {
+                whitespace = pdfSource.read();
+                if( whitespace != 0x0A )
+                {
+                    pdfSource.unread( whitespace );
+                    //The spec says a CR without LF is invalid here but it happens
+                    //in the real world so we must support it: no exception is
+                    //thrown and the value that followed the CR has been pushed
+                    //back, so it is read again as part of the stream data.
+                }
+            }
+            else if (whitespace == 0x0A)
+            {
+                //that is fine
+            }
+            else
+            {
+                //we are in an error.
+                //but again we will do a lenient parsing and just assume that everything
+                //is fine
+                pdfSource.unread( whitespace );
+                //(no exception is thrown; the value that was read has been pushed
+                //back, so it is read again as part of the stream data)
+
+            }
+
+            //look up the declared length, given directly or through a COSObject
+            COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH);
+            long length = -1;
+            if( streamLength instanceof COSNumber )
+            {
+                length = ((COSNumber)streamLength).intValue();
+            }
+            else if( streamLength instanceof COSObject &&
+                     ((COSObject)streamLength).getObject() instanceof COSNumber )
+            {
+                length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
+            }
+
+            //NOTE: length is only referenced by the disabled length based reading
+            //below; the data is always read by searching for "endstream".
+
+            //open the output that receives the stream data
+            out = stream.createFilteredStream( streamLength );
+            String endStream = null;
+            //the length is wrong in some pdf documents which means
+            //that PDFBox must basically ignore it in order to be able to read
+            //the most number of PDF documents.  This of course is a penalty hit,
+            //maybe I could implement a faster parser.
+            /**if( length != -1 )
+            {
+                byte[] buffer = new byte[1024];
+                int amountRead = 0;
+                int totalAmountRead = 0;
+                while( amountRead != -1 && totalAmountRead < length )
+                {
+                    int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
+                    amountRead = pdfSource.read(buffer,0,maxAmountToRead);
+                    totalAmountRead += amountRead;
+                    if( amountRead != -1 )
+                    {
+                        out.write( buffer, 0, amountRead );
+                    }
+                }
+            }
+            else
+            {**/
+                readUntilEndStream( out );
+            /**}*/
+            skipSpaces();
+            endStream = readString();
+            //if another token follows the data, copy up to the next "endstream" and check once more
+            if (!endStream.equals("endstream"))
+            {
+                readUntilEndStream( out );
+                endStream = readString();
+                if( !endStream.equals( "endstream" ) )
+                {
+                    throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
+                }
+            }
+        }
+        finally
+        {
+            if( out != null )
+            {
+                out.close();
+            }
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSStream() done" );
+        }
+        return stream;
+    }
+
+    /**
+     * This will copy bytes from the source to the output until the keyword
+     * "endstream" has been read or the source is exhausted.  The keyword is
+     * then pushed back so that the caller can read it as a regular token.
+     *
+     * The values read last are held in a ring buffer that has the size of
+     * the keyword; a byte is written to the output right before its slot is
+     * reused, i.e. when it can no longer be part of the keyword.  End of line
+     * bytes in front of the keyword are written like any other data byte.
+     *
+     * NOTE(review): if the source ends before the keyword is found, the bytes
+     * still held in the ring buffer are not written and the keyword is pushed
+     * back nevertheless -- confirm that this is intended.
+     *
+     * @param out The stream to write the data to.
+     *
+     * @throws IOException If there is an error reading or writing the data.
+     */
+    private void readUntilEndStream( OutputStream out ) throws IOException
+    {
+        final int window = ENDSTREAM.length;
+        final byte[] ring = new byte[window];
+        //number of values stored in the ring so far; the end of file marker
+        //that terminates the loop is counted as well
+        int stored = 0;
+        //result of the most recent read, -1 once the source is exhausted
+        int lastRead = 0;
+
+        while( true )
+        {
+            if( cmpCircularBuffer( ring, stored, ENDSTREAM ) )
+            {
+                //the values read last spell the keyword
+                break;
+            }
+            if( lastRead == -1 )
+            {
+                //the source is exhausted
+                break;
+            }
+
+            int slot = stored % window;
+            if( stored >= window )
+            {
+                //the ring is full, so the oldest byte is written before its slot is reused
+                out.write( ring[slot] );
+            }
+
+            lastRead = pdfSource.read();
+            ring[slot] = (byte)lastRead;
+            stored++;
+        }
+
+        //give the keyword back to the source
+        pdfSource.unread( ENDSTREAM );
+    }
+
+    /**
+     * This checks if the compareTo.length bytes that precede position
+     * currentIndex in the circular buffer match the compareTo byte array.
+     * If currentIndex is smaller than compareTo.length the result is false.
+     */
+    private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo )
+    {
+        int needed = compareTo.length;
+        int size = buffer.length;
+        int start = currentIndex - needed;
+        int matched = 0;
+        if( start >= 0 )
+        {
+            while( matched < needed && buffer[(start+matched)%size] == compareTo[matched] )
+            {
+                matched++;
+            }
+        }
+        return start >= 0 && matched == needed;
+    }
+
+    /**
+     * This will parse a PDF string, either a literal string "(...)" or a hex string "<...>".
+     *
+     * @return The parsed PDF string.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected COSString parseCOSString() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSString() " + pdfSource );
+        }
+        char nextChar = (char)pdfSource.read();
+        COSString retval = new COSString();
+        char openBrace;
+        char closeBrace;
+        if( nextChar == '(' )
+        {
+            openBrace = '(';
+            closeBrace = ')';
+        }
+        else if( nextChar == '<' )
+        {
+            openBrace = '<';
+            closeBrace = '>';
+        }
+        else
+        {
+            throw new IOException( "parseCOSString string should start with '(' or '<' and not '" +
+                                   nextChar + "' " + pdfSource );
+        }
+
+        //This is the number of braces read
+        //
+        int braces = 1;
+        int c = pdfSource.read();
+        while( braces > 0 && c != -1)
+        {
+            char ch = (char)c;
+            //set when a value has been read ahead that still has to be processed in the next round
+            int nextc = -2; // not yet read
+
+            if(ch == closeBrace)
+            {
+                braces--;
+                byte[] nextThreeBytes = new byte[3];
+                int amountRead = pdfSource.read(nextThreeBytes);
+                //lets handle the special case seen in Bull  River Rules and Regulations.pdf
+                //The dictionary looks like this
+                //    2 0 obj
+                //    <<
+                //        /Type /Info
+                //        /Creator (PaperPort http://www.scansoft.com)
+                //        /Producer (sspdflib 1.0 http://www.scansoft.com)
+                //        /Title ( (5)
+                //        /Author ()
+                //        /Subject ()
+                //
+                // Notice the /Title, the braces are not even but they should
+                // be.  So lets assume that if we encounter this scenario
+                //   <end_brace><new_line><opening_slash> then that
+                // means that there is an error in the pdf and assume that
+                // was the end of the string.
+                if( amountRead == 3 )
+                {
+                    if( nextThreeBytes[0] == 0x0d &&
+                        nextThreeBytes[1] == 0x0a &&
+                        nextThreeBytes[2] == 0x2f )
+                    {
+                        braces = 0;
+                    }
+                }
+                //read reports -1 when the brace was the last byte; a negative length must not be unread
+                if( amountRead > 0 )
+                {
+                    pdfSource.unread( nextThreeBytes, 0, amountRead );
+                }
+                if( braces != 0 )
+                {
+                    retval.append( ch );
+                }
+            }
+            else if( ch == openBrace )
+            {
+                braces++;
+                retval.append( ch );
+            }
+            else if( ch == '\\' )
+            {
+                 //patched by ram
+                char next = (char)pdfSource.read();
+                switch(next)
+                {
+                    case 'n':
+                        retval.append( '\n' );
+                        break;
+                    case 'r':
+                        retval.append( '\r' );
+                        break;
+                    case 't':
+                        retval.append( '\t' );
+                        break;
+                    case 'b':
+                        retval.append( '\b' );
+                        break;
+                    case 'f':
+                        retval.append( '\f' );
+                        break;
+                    case '(':
+                    case ')':
+                    case '\\':
+                        retval.append( next );
+                        break;
+                    case 10:
+                    case 13:
+                        //this is a break in the line so ignore it and the newline and continue
+                        c = pdfSource.read();
+                        while( isEOL(c) && c != -1)
+                        {
+                            c = pdfSource.read();
+                        }
+                        nextc = c;
+                        break;
+                    case '0':
+                    case '1':
+                    case '2':
+                    case '3':
+                    case '4':
+                    case '5':
+                    case '6':
+                    case '7':
+                    {
+                        StringBuffer octal = new StringBuffer();
+                        octal.append( next );
+                        c = pdfSource.read();
+                        char digit = (char)c;
+                        if( digit >= '0' && digit <= '7' )
+                        {
+                            octal.append( digit );
+                            c = pdfSource.read();
+                            digit = (char)c;
+                            if( digit >= '0' && digit <= '7' )
+                            {
+                                octal.append( digit );
+                            }
+                            else
+                            {
+                                nextc = c;
+                            }
+                        }
+                        else
+                        {
+                            nextc = c;
+                        }
+
+                        int character = 0;
+                        try
+                        {
+                            character = Integer.parseInt( octal.toString(), 8 );
+                        }
+                        catch( NumberFormatException e )
+                        {
+                            throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
+                        }
+                        retval.append( character );
+                        break;
+                    }
+                    default:
+                    {
+                        retval.append( '\\' );
+                        retval.append( next );
+                        //another problem with PDFs: sometimes the \ doesn't really
+                        //mean escape like the PDF spec says it does, sometimes it should be literal
+                        //which is what we will assume here, so no exception is thrown
+                        //for an unknown escape sequence.
+                    }
+                }
+            }
+            else
+            {
+                if( openBrace == '<' )
+                {
+                    if( isHexDigit(ch) )
+                    {
+                        retval.append( ch );
+                    }
+                }
+                else
+                {
+                    retval.append( ch );
+                }
+            }
+            if (nextc != -2)
+            {
+                c = nextc;
+            }
+            else
+            {
+                c = pdfSource.read();
+            }
+        }
+        if (c != -1)
+        {
+            pdfSource.unread(c);
+        }
+        if( openBrace == '<' )
+        {
+            retval = COSString.createFromHexString( retval.getString() );
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSString() done parsed=" + retval );
+        }
+        return retval;
+    }
+
+    /**
+     * This will parse a PDF array object.
+     *
+     * @return The parsed PDF array.
+     *
+     * @throws IOException If there is an error parsing the stream or a reference in the array is malformed.
+     */
+    protected COSArray parseCOSArray() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSArray() " + pdfSource );
+        }
+        char ch = (char)pdfSource.read();
+        if( ch != '[')
+        {
+            throw new IOException( "expected='[' actual='" + ch + "'" );
+        }
+        COSArray po = new COSArray();
+        skipSpaces();
+        int i = 0;
+        while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') )
+        {
+            COSBase pbo = parseDirObject();
+            if( pbo instanceof COSObject )
+            {
+                //a COSObject stands for a reference: the two entries added last are its generation and object number
+                Object genNumber = po.size() > 0 ? po.remove( po.size() -1 ) : null;
+                Object number = po.size() > 0 ? po.remove( po.size() -1 ) : null;
+                if( !(number instanceof COSInteger) || !(genNumber instanceof COSInteger) )
+                {
+                    throw new IOException( "Error: expected integer object and generation number before 'R' " + pdfSource );
+                }
+                COSObjectKey key = new COSObjectKey(((COSInteger)number).intValue(), ((COSInteger)genNumber).intValue());
+                pbo = document.getObjectFromPool(key);
+            }
+            if( pbo != null ) //null could be a bad object in the array which is just skipped
+            {
+                po.add( pbo );
+            }
+            skipSpaces();
+        }
+        pdfSource.read(); //read ']'
+        skipSpaces();
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSArray() done peek='" + (char)pdfSource.peek() + "'" );
+        }
+        return po;
+    }
+
+    /**
+     * Determine if a character terminates a PDF name.
+     *
+     * @param ch The character; an end of file value (-1) that was cast to char is accepted as well.
+     * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
+     */
+    protected boolean isEndOfName(char ch)
+    {
+        //a char is never negative, so comparing it with the int -1 could never detect the end of file
+        return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<'
+            || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' ||
+            ch == (char)-1); //EOF in the form it has after a cast to char
+    }
+
+    /**
+     * This will parse a PDF name from the stream.  A '#' that is followed by two
+     * hex digits is replaced by the character with that code.
+     *
+     * @return The parsed PDF name.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected COSName parseCOSName() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseCOSName() " + pdfSource );
+        }
+        COSName retval = null;
+        int c = pdfSource.read();
+        if( (char)c != '/')
+        {
+            throw new IOException("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource );
+        }
+        // build the name
+        StringBuffer buffer = new StringBuffer();
+        c = pdfSource.read();
+        while( c != -1 )
+        {
+            char ch = (char)c;
+            if(ch == '#')
+            {
+                //the two values are kept as int so that the end of the data (-1)
+                //can be told apart from a real byte
+                int c1 = pdfSource.read();
+                int c2 = pdfSource.read();
+
+                // Prior to PDF v1.2, the # was not a special character.  Also,
+                // it has been observed that various PDF tools do not follow the
+                // spec with respect to the # escape, even though they report
+                // PDF versions of 1.2 or later.  The solution here is that we
+                // interpret the # as an escape only when it is followed by two
+                // valid hex digits.
+                //
+                if (isHexDigit((char)c1) && isHexDigit((char)c2))
+                {
+                    //two hex digits always form a valid number, so parseInt cannot fail here
+                    String hex = "" + (char)c1 + (char)c2;
+                    buffer.append( (char) Integer.parseInt(hex, 16));
+                    c = pdfSource.read();
+                }
+                else
+                {
+                    //the '#' is taken literally and the value read after it is processed
+                    //next; an end of data value must not be pushed back as if it were a byte
+                    if( c2 != -1 )
+                    {
+                        pdfSource.unread(c2);
+                    }
+                    c = c1;
+                    buffer.append( ch );
+                }
+            }
+            else if (isEndOfName(ch))
+            {
+                break;
+            }
+            else
+            {
+                buffer.append( ch );
+                c = pdfSource.read();
+            }
+        }
+
+        //the character that ended the name is not part of it
+        if (c != -1)
+        {
+            pdfSource.unread(c);
+        }
+        retval = COSName.getPDFName( buffer.toString() );
+        return retval;
+    }
+
+    /**
+     * This will parse a boolean object from the stream.  The next character
+     * decides whether the keyword "true" or the keyword "false" is expected.
+     *
+     * @return The parsed boolean object.
+     *
+     * @throws IOException If an IO error occurs during parsing or if neither
+     *         "true" nor "false" is found.
+     */
+    protected COSBoolean parseBoolean() throws IOException
+    {
+        COSBoolean retval = null;
+        char c = (char)pdfSource.peek();
+
+        if( c == 't' )
+        {
+            //the keywords are pure ASCII, so they are decoded independently of the platform charset
+            byte[] trueArray = new byte[ 4 ];
+            int amountRead = pdfSource.read( trueArray, 0, 4 );
+            String trueString = new String( trueArray, 0, amountRead, "ISO-8859-1" );
+            if( !trueString.equals( "true" ) )
+            {
+                throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
+            }
+            retval = COSBoolean.TRUE;
+        }
+        else if( c == 'f' )
+        {
+            byte[] falseArray = new byte[ 5 ];
+            int amountRead = pdfSource.read( falseArray, 0, 5 );
+            String falseString = new String( falseArray, 0, amountRead, "ISO-8859-1" );
+            if( !falseString.equals( "false" ) )
+            {
+                throw new IOException( "Error parsing boolean: expected='false' actual='" + falseString + "'" );
+            }
+            retval = COSBoolean.FALSE;
+        }
+        else
+        {
+            //neither keyword starts with this character
+            throw new IOException( "Error parsing boolean expected='t or f' actual='" + c + "'" );
+        }
+
+        return retval;
+    }
+
+    /**
+     * This will parse a direct object from the stream.
+     *
+     * @return The parsed object.
+     *
+     * @throws IOException If there is an error during parsing.
+     */
+    protected COSBase parseDirObject() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseDirObject() " + pdfSource );
+        }
+
+        skipSpaces();
+        //only peek here: every branch below consumes its own input
+        char first = (char)pdfSource.peek();
+        if( first == (char)-1 )
+        {
+            //peek() reported -1, there is nothing left to parse
+            return null;
+        }
+
+        COSBase parsed = null;
+        if( first == '<' )
+        {
+            //look one byte further to tell '<<' apart from a single '<',
+            //then put the first bracket back for the delegate parser
+            int opening = pdfSource.read();
+            char second = (char)pdfSource.peek();
+            pdfSource.unread( opening );
+            if( second == '<' )
+            {
+                parsed = parseCOSDictionary();
+                skipSpaces();
+            }
+            else
+            {
+                parsed = parseCOSString();
+            }
+        }
+        else if( first == '[' )
+        {
+            //array
+            parsed = parseCOSArray();
+        }
+        else if( first == '(' )
+        {
+            parsed = parseCOSString();
+        }
+        else if( first == '/' )
+        {
+            //name
+            parsed = parseCOSName();
+        }
+        else if( first == 'n' )
+        {
+            //null
+            String keyword = readString();
+            if( !keyword.equals( "null") )
+            {
+                throw new IOException("Expected='null' actual='" + keyword + "'");
+            }
+            parsed = COSNull.NULL;
+        }
+        else if( first == 't' )
+        {
+            byte[] keywordBytes = new byte[4];
+            int count = pdfSource.read( keywordBytes, 0, 4 );
+            String keyword = new String( keywordBytes, 0, count );
+            if( !keyword.equals( "true" ) )
+            {
+                throw new IOException( "expected true actual='" + keyword + "' " + pdfSource );
+            }
+            parsed = COSBoolean.TRUE;
+        }
+        else if( first == 'f' )
+        {
+            byte[] keywordBytes = new byte[5];
+            int count = pdfSource.read( keywordBytes, 0, 5 );
+            String keyword = new String( keywordBytes, 0, count );
+            if( !keyword.equals( "false" ) )
+            {
+                throw new IOException( "expected false actual='" + keyword + "' " + pdfSource );
+            }
+            parsed = COSBoolean.FALSE;
+        }
+        else if( first == 'R' )
+        {
+            //consume the 'R' and hand back an object wrapper without a target
+            pdfSource.read();
+            parsed = new COSObject(null);
+        }
+        else if( Character.isDigit(first) || first == '-' || first == '+' || first == '.' )
+        {
+            //number: gather every character that may be part of a numeric literal
+            StringBuffer number = new StringBuffer();
+            int raw = pdfSource.read();
+            char ch = (char)raw;
+            while( Character.isDigit( ch ) || "-+.Ee".indexOf( ch ) >= 0 )
+            {
+                number.append( ch );
+                raw = pdfSource.read();
+                ch = (char)raw;
+            }
+            if( raw != -1 )
+            {
+                //the first non numeric character belongs to the next token
+                pdfSource.unread( raw );
+            }
+            parsed = COSNumber.get( number.toString() );
+        }
+        else
+        {
+            //This is not suppose to happen, but we will allow for it
+            //so we are more compatible with POS writers that don't
+            //follow the spec
+            String ignored = readString();
+            if( log.isDebugEnabled() )
+            {
+                log.debug("parseDirObject() bad DIR object found. ignoring: '" + ignored + "'");
+            }
+            if( ignored == null || ignored.length() == 0 )
+            {
+                //nothing was consumed; fail here, we can end up in an
+                //infinite loop otherwise
+                int peek = pdfSource.peek();
+                throw new IOException( "Unknown dir object c='" + first +
+                   "' cInt=" + (int)first + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource );
+            }
+        }
+
+        if( log.isDebugEnabled() )
+        {
+            log.debug("parseDirObject() done retval=" +parsed );
+        }
+        return parsed;
+    }
+
+    /**
+     * This will read the next string from the stream.
+     *
+     * @return The string that was read from the stream.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected String readString() throws IOException
+    {
+        skipSpaces();
+        StringBuffer token = new StringBuffer();
+        int next;
+        //collect characters up to, but not including, the first terminator
+        for( next = pdfSource.read();
+             !( isEndOfName((char)next) || isClosing(next) || next == -1 );
+             next = pdfSource.read() )
+        {
+            token.append( (char)next );
+        }
+        if( next != -1 )
+        {
+            //the terminator is left in the stream for the next reader
+            pdfSource.unread( next );
+        }
+        return token.toString();
+    }
+
+    /**
+     * This will read an expected string from the stream, skipping any leading
+     * whitespace before it and any end of line markers directly after it.
+     *
+     * @param theString The next expected string in the stream.
+     *
+     * @return The characters that were read and matched against theString.
+     *
+     * @throws IOException If there is an error reading from the stream or theString does not match what was read.
+     */
+    protected String readExpectedString( String theString ) throws IOException
+    {
+        //skip whatever whitespace precedes the expected text
+        int current;
+        do
+        {
+            current = pdfSource.read();
+        }
+        while( isWhitespace(current) && current != -1 );
+
+        StringBuffer matched = new StringBuffer( theString.length() );
+        //matched.length() doubles as the index of the next expected character
+        while( !isEOL(current) && current != -1 && matched.length() < theString.length() )
+        {
+            char actual = (char)current;
+            char wanted = theString.charAt( matched.length() );
+            matched.append( actual );
+            if( wanted != actual )
+            {
+                throw new IOException( "Error: Expected to read '" + theString +
+                    "' instead started reading '" +matched.toString() + "'" );
+            }
+            current = pdfSource.read();
+        }
+
+        //swallow the end of line markers that follow
+        for( ; isEOL(current) && current != -1; current = pdfSource.read() )
+        {
+            //nothing to do, the markers are simply dropped
+        }
+        if( current != -1 )
+        {
+            pdfSource.unread( current );
+        }
+        return matched.toString();
+    }
+
+    /**
+     * This will read the next string from the stream up to a certain length.
+     *
+     * @param length The length to stop reading at.
+     *
+     * @return The string that was read from the stream of length 0 to length.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected String readString( int length ) throws IOException
+    {
+        skipSpaces();
+
+        int next = pdfSource.read();
+
+        //size the buffer to the requested maximum instead of the default
+        //capacity, the strings read here are usually very short
+        StringBuffer token = new StringBuffer(length);
+        while( true )
+        {
+            boolean stop = isWhitespace(next) || isClosing(next) || next == -1 ||
+                token.length() >= length ||
+                next == '[' ||
+                next == '<' ||
+                next == '(' ||
+                next == '/';
+            if( stop )
+            {
+                break;
+            }
+            token.append( (char)next );
+            next = pdfSource.read();
+        }
+        if( next != -1 )
+        {
+            //whatever ended the token stays in the stream
+            pdfSource.unread( next );
+        }
+        return token.toString();
+    }
+
+    /**
+     * This will tell if the next character is a closing brace( close of PDF array ).
+     *
+     * @return true if the next byte is ']', false otherwise.
+     *
+     * @throws IOException If an IO error occurs.
+     */
+    protected boolean isClosing() throws IOException
+    {
+        //look at the upcoming byte without consuming it
+        int upcoming = pdfSource.peek();
+        return isClosing( upcoming );
+    }
+    
+    /**
+     * This will tell if the given character is a closing bracket( close of PDF array ).
+     *
+     * @param c The character to check.
+     * @return true if c is ']', false otherwise.
+     */
+    protected boolean isClosing(int c) 
+    {
+        return c == ']';
+    }
+
+    /**
+     * This will read bytes until the end of line marker occurs.
+     *
+     * @return The characters between the current position and the end of the line.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected String readLine() throws IOException
+    {
+        //drop leading whitespace; this also skips over empty lines
+        int current;
+        do
+        {
+            current = pdfSource.read();
+        }
+        while( isWhitespace(current) && current != -1 );
+
+        //gather everything up to the end of the line
+        StringBuffer line = new StringBuffer( 11 );
+        for( ; !isEOL(current) && current != -1; current = pdfSource.read() )
+        {
+            line.append( (char)current );
+        }
+
+        //consume the end of line markers themselves
+        for( ; isEOL(current) && current != -1; current = pdfSource.read() )
+        {
+            //nothing to do, the markers are simply dropped
+        }
+        if( current != -1 )
+        {
+            //first byte of the following line goes back into the stream
+            pdfSource.unread( current );
+        }
+        return line.toString();
+    }
+
+    /**
+     * This will tell if the next byte to be read is an end of line byte.
+     *
+     * @return true if the next byte is 0x0A or 0x0D.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected boolean isEOL() throws IOException
+    {
+        //look at the upcoming byte without consuming it
+        int upcoming = pdfSource.peek();
+        return isEOL( upcoming );
+    }
+    
+    /**
+     * This will tell if the given character is an end of line byte.
+     *
+     * @param c The character to check against end of line
+     * @return true if c is 0x0A or 0x0D.
+     */
+    protected boolean isEOL(int c)
+    {
+        //line feed (10) or carriage return (13)
+        return c == '\n' || c == '\r';
+    }
+
+    /**
+     * This will tell if the next byte is whitespace or not.
+     *
+     * @return true if the next byte in the stream is a whitespace character.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected boolean isWhitespace() throws IOException
+    {
+        //look at the upcoming byte without consuming it
+        int upcoming = pdfSource.peek();
+        return isWhitespace( upcoming );
+    }
+
+    /**
+     * This will tell if the given character is whitespace or not.
+     *
+     * @param c The character to check against whitespace
+     *
+     * @return true if c is a whitespace character.
+     */
+    protected boolean isWhitespace( int c )
+    {
+        switch( c )
+        {
+            case 0:  //null
+            case 9:  //horizontal tab
+            case 10: //line feed
+            case 12: //form feed
+            case 13: //carriage return
+            case 32: //space
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /**
+     * This will skip all spaces and comments that are present.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected void skipSpaces() throws IOException
+    {
+        int current = pdfSource.read();
+        boolean skipping = true;
+        //the whitespace values are spelled out inline rather than going
+        //through isWhitespace(int)
+        while( skipping )
+        {
+            switch( current )
+            {
+                case '%':
+                    //a comment: discard everything up to the end of the line
+                    do
+                    {
+                        current = pdfSource.read();
+                    }
+                    while( !isEOL(current) && current != -1 );
+                    break;
+                case 0:
+                case 9:
+                case 10:
+                case 12:
+                case 13:
+                case 32:
+                    current = pdfSource.read();
+                    break;
+                default:
+                    skipping = false;
+                    break;
+            }
+        }
+        if( current != -1 )
+        {
+            //first significant byte goes back into the stream
+            pdfSource.unread( current );
+        }
+    }
+
+    /**
+     * this will compare two byte arrays.
+     *
+     * @param first The first byte array to compare.
+     * @param second The second byte array to compare.
+     *
+     * @return true if first is at least as long as second AND forall i &lt; second.length : first[i] = second[i]
+     */
+    private boolean cmpArray( byte[] first, byte[] second )
+    {
+        return cmpArray( first, 0, second );
+    }
+
+    /**
+     * This will compare two arrays for equality.
+     *
+     * @param first The first array to compare.
+     * @param firstOffset The first byte to start comparing.
+     * @param second The second array to compare.
+     */
+    private boolean cmpArray( byte[] first, int firstOffset, byte[] second )
+    {
+        //not enough bytes left in the first array to hold the second one
+        if( first.length-firstOffset < second.length )
+        {
+            return false;
+        }
+        //stop at the first position where the arrays differ
+        for( int index = 0; index < second.length; index++ )
+        {
+            if( first[ firstOffset + index ] != second[ index ] )
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * This will read an integer from the stream.
+     *
+     * @return The integer that was read from the stream.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected int readInt() throws IOException
+    {
+        skipSpaces();
+        int retval = 0;
+
+        StringBuffer intBuffer = new StringBuffer();
+        int lastByte = pdfSource.read();
+        //The number ends at end of stream, at any whitespace byte as defined
+        //by isWhitespace(int) (this includes the 0 byte, see sourceforge bug
+        //853328, and also tab and form feed), or at a character that starts
+        //or closes another token.  The delimiter set is the same one that
+        //readString(int) stops at.
+        while( lastByte != -1 &&
+               !isWhitespace( lastByte ) &&
+               !isClosing( lastByte ) &&
+               lastByte != '[' &&
+               lastByte != '<' &&
+               lastByte != '(' &&
+               lastByte != '/' )
+        {
+            intBuffer.append( (char)lastByte );
+            lastByte = pdfSource.read();
+        }
+        //A terminating whitespace byte stays consumed, exactly as before.
+        //A delimiter is part of the following token, so it is pushed back.
+        if( lastByte != -1 && !isWhitespace( lastByte ) )
+        {
+            pdfSource.unread( lastByte );
+        }
+        try
+        {
+            retval = Integer.parseInt( intBuffer.toString() );
+        }
+        catch( NumberFormatException e )
+        {
+            throw new IOException( "Error: Expected an integer type, actual='" + intBuffer + "'" );
+        }
+        return retval;
+    }
+
+    /**
+     * This will add an xref.
+     *
+     * @param xref The xref to add.
+     */
+    public void addXref( PDFXref xref )
+    {
+        //appended to the list that getXrefs() hands out
+        this.xrefs.add( xref );
+    }
+
+    /**
+     * This will get all of the xrefs.
+     *
+     * @return A list of all xrefs.
+     */
+    public List getXrefs()
+    {
+        //the internal list itself is returned, not a copy
+        List allXrefs = this.xrefs;
+        return allXrefs;
+    }
+
+    /**
+     * This will set the xrefs for this parser.
+     *
+     * @param newXrefs The xrefs for this parser.
+     */
+    private void setXrefs( List newXrefs )
+    {
+        //replaces the current list, nothing is copied
+        this.xrefs = newXrefs;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java
new file mode 100644
index 0000000..6fb7563
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java
@@ -0,0 +1,137 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.IOException;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSInteger;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+
+
+import org.apache.log4j.Logger;
+
+/**
+ * This will parse a PDF 1.5 object stream and extract all of the objects from the stream.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.4 $
+ */
+public class PDFObjectStreamParser extends BaseParser
+{
+    private static Logger log = Logger.getLogger(PDFObjectStreamParser.class);
+    private List streamObjects = null;
+    private List objectNumbers = null;
+    private COSStream stream;
+
+    /**
+     * Constructor.
+     *
+     * @param strm The stream to parse.
+     * @param doc The document for the current parsing.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFObjectStreamParser( COSStream strm, COSDocument doc ) throws IOException
+    {
+       super( strm.getUnfilteredStream() );
+       setDocument( doc );
+       stream = strm;
+    }
+
+    /**
+     * This will parse the tokens in the stream.  This will close the
+     * stream when it is finished parsing.
+     *
+     * @throws IOException If there is an error while parsing the stream.
+     */
+    public void parse() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() start" );
+        }
+
+        try
+        {
+            //need to first parse the header.
+            int numberOfObjects = stream.getInt( "N" );
+            objectNumbers = new ArrayList( numberOfObjects );
+            streamObjects = new ArrayList( numberOfObjects );
+            for( int i=0; i<numberOfObjects; i++ )
+            {
+                int objectNumber = readInt();
+                int offset = readInt();
+                if( log.isDebugEnabled() )
+                {
+                    log.debug( "objNum:" + objectNumber + " offset:" + offset );
+                }
+                objectNumbers.add( new Integer( objectNumber ) );
+            }
+            COSObject object = null;
+            COSBase cosObject = null;
+            int objectCounter = 0;
+            while( (cosObject = parseDirObject()) != null )
+            {
+                object = new COSObject(cosObject);
+                object.setGenerationNumber( COSInteger.ZERO );
+                COSInteger objNum = 
+                    new COSInteger( ((Integer)objectNumbers.get( objectCounter)).intValue() );
+                object.setObjectNumber( objNum );
+                streamObjects.add( object );
+                objectCounter++;
+            }
+        }
+        finally
+        {
+            pdfSource.close();
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() end" );
+        }
+    }
+
+    /**
+     * This will get the objects that were parsed from the stream.
+     *
+     * @return All of the objects in the stream.
+     */
+    public List getObjects()
+    {
+        return streamObjects;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFParser.java b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
new file mode 100644
index 0000000..d655ef1
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
@@ -0,0 +1,557 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.util.Iterator;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.exceptions.WrappedIOException;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.pdmodel.fdf.FDFDocument;
+
+import org.pdfbox.persistence.util.COSObjectKey;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class will handle the parsing of the PDF document.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.47 $
+ */
+public class PDFParser extends BaseParser
+{
+    //log4j logger for this class
+    private static Logger log = Logger.getLogger( PDFParser.class );
+    //ASCII code of the space character
+    private static final int SPACE_BYTE = 32;
+
+    //prefix that parse() searches for in the first line of the input
+    private static final String PDF_HEADER = "%PDF-";
+    //document built by parse(); null until parse() has been called
+    private COSDocument document;
+
+    /**
+     * Temp file directory.
+     */
+    private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) );
+
+    //optional file handed to the COSDocument constructor by parse(); when it
+    //is null the document is created from tempDirectory instead
+    private RandomAccessFile raf = null;
+
+    /**
+     * Constructor.
+     *
+     * @param input The input stream that contains the PDF document.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFParser( InputStream input ) throws IOException
+    {
+        //no caller supplied random access file
+        this( input, (RandomAccessFile)null );
+    }
+
+    /**
+     * Constructor to allow control over RandomAccessFile.
+     * @param input The input stream that contains the PDF document.
+     * @param rafi The RandomAccessFile to be used in internal COSDocument
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFParser(InputStream input, RandomAccessFile rafi)
+        throws IOException
+    {
+        super(input);
+        //kept until parse() creates the COSDocument; may be null
+        raf = rafi;
+    }
+
+    /**
+     * This is the directory where pdfbox will create a temporary file
+     * for storing pdf document stream in.  By default this directory will
+     * be the value of the system property java.io.tmpdir.
+     *
+     * @param tmpDir The directory to create scratch files needed to store
+     *        pdf document streams.
+     */
+    public void setTempDirectory( File tmpDir )
+    {
+        //read by parse() when no random access file was supplied
+        this.tempDirectory = tmpDir;
+    }
+
+    /**
+     * This will parse the stream and create the PDF document.  This will close
+     * the stream when it is done parsing.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public void parse() throws IOException
+    {
+        //Overview: create the COSDocument, read and validate the header line,
+        //then parse objects until parseObject() returns null.  On any failure
+        //the document is closed before the exception is passed on; the input
+        //stream is closed in every case.
+        try
+        {
+            //back the document with the caller supplied file when there is
+            //one, otherwise with the temp directory
+            if ( raf == null )
+            {
+                document = new COSDocument( tempDirectory );
+            }
+            else
+            {
+                document = new COSDocument( raf );
+            }
+            setDocument( document );
+            String header = readLine();
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Header=" + header );
+            }
+            //the header string is stored as read, before any leading garbage
+            //is trimmed off below
+            document.setHeaderString( header );
+
+            //the line must be long enough to hold the prefix plus at least
+            //one version character
+            if( header.length() < PDF_HEADER.length()+1 )
+            {
+                throw new IOException( "Error: Header is corrupt '" + header + "'" );
+            }
+
+            //sometimes there are some garbage bytes in the header before the header
+            //actually starts, so lets try to find the header first.
+            //NOTE(review): when the prefix is not found at all (indexOf
+            //returns -1) the line is left untouched and the version is still
+            //taken from the fixed offset below; presumably this is what lets
+            //headers with a different prefix of the same length through -
+            //confirm before tightening this check
+            int headerStart = header.indexOf( PDF_HEADER );
+
+            //greater than zero because if it is zero then
+            //there is no point of trimming
+            if( headerStart > 0 )
+            {
+                //trim off any leading characters
+                header = header.substring( headerStart, header.length() );
+            }
+
+            try
+            {
+                //the version is at most the three characters that follow the
+                //prefix, for example "1.4"
+                float pdfVersion = Float.parseFloat( 
+                    header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
+                document.setVersion( pdfVersion );
+            }
+            catch( NumberFormatException e )
+            {
+                throw new IOException( "Error getting pdf version:" + e );
+            }
+
+            skipHeaderFillBytes();
+
+
+            Object nextObject;
+            //remembers whether the most recently parsed object was an xref,
+            //so that an IOException right after it can be treated as
+            //trailing garbage (see the catch block below)
+            boolean wasLastParsedObjectAnXref = false;
+            try
+            {
+                while( (nextObject = parseObject()) != null )
+                {
+                    if( nextObject instanceof PDFXref )
+                    {
+                        PDFXref xref = (PDFXref)nextObject;
+                        addXref(xref);
+                        wasLastParsedObjectAnXref = true;
+                    }
+                    else
+                    {
+                        wasLastParsedObjectAnXref = false;
+                    }
+                    skipSpaces();
+                }
+                //no trailer was set while parsing: build one by merging the
+                //streams of all objects of type "XRef"
+                if( document.getTrailer() == null )
+                {
+                    COSDictionary trailer = new COSDictionary();
+                    Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
+                    while( xrefIter.hasNext() )
+                    {
+                        COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
+                        trailer.addAll( next );
+                    }
+                    document.setTrailer( trailer );
+                }
+                //object streams are only dereferenced here when the document
+                //is not encrypted
+                if( !document.isEncrypted() )
+                {
+                    document.dereferenceObjectStreams();
+                }
+            }
+            catch( IOException e )
+            {
+                //NOTE: when the exception is swallowed here, the trailer
+                //fallback and the object stream dereferencing above have
+                //been skipped for this document
+                if( wasLastParsedObjectAnXref )
+                {
+                    log.debug( "Skipping some garbage", e );
+                    //Then we assume that there is just random garbage after
+                    //the xref, not sure why the PDF spec allows this but it does.
+                }
+                else
+                {
+                    //some other error so just pass it along
+                    throw e;
+                }
+            }
+        }
+        catch( Throwable t )
+        {
+            //so if the PDF is corrupt then close the document and clear
+            //all resources to it
+            if( document != null )
+            {
+                document.close();
+            }
+            //IOExceptions are rethrown unchanged, everything else is wrapped
+            //so that callers only have to deal with IOException
+            if( t instanceof IOException )
+            {
+                throw (IOException)t;
+            }
+            else
+            {
+                throw new WrappedIOException( t );
+            }
+        }
+        finally
+        {
+            //the input stream is closed whether parsing succeeded or not
+            pdfSource.close();
+        }
+    }
+
+    /**
+     * This will skip a header's binary fill bytes.  This is in accordance to
+     * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
+     *
+     * @throws IOException If there is an error reading from the stream.
+    */
+    protected void skipHeaderFillBytes() throws IOException
+    {
+        skipSpaces();
+        char upcoming = (char)pdfSource.peek();
+        if( Character.isDigit( upcoming ) )
+        {
+            //the next token starts with a digit: no fill bytes
+            return;
+        }
+        //Fill bytes conform with PDF reference (but without comment sign)
+        //=> skip until EOL
+        readLine();
+    }
+
+    /**
+     * This will get the document that was parsed.  parse() must be called before this is called.
+     * When you are done with this document you must call close() on it to release
+     * resources.
+     *
+     * @return The document that was parsed.
+     *
+     * @throws IOException If there is an error getting the document.
+     */
+    public COSDocument getDocument() throws IOException
+    {
+        if( document != null )
+        {
+            return document;
+        }
+        //the field is only assigned inside parse()
+        throw new IOException( "You must call parse() before calling getDocument()" );
+    }
+
+    /**
+     * This will wrap the parsed document in a new PD layer document.  When you
+     * are done with it you must call close() on it to release resources.
+     *
+     * @return A new PDDocument created from the parsed COS document.
+     * @throws IOException If there is an error getting the document.
+     */
+    public PDDocument getPDDocument() throws IOException
+    {
+        COSDocument parsed = getDocument();
+        return new PDDocument( parsed );
+    }
+
+    /**
+     * This will wrap the parsed document in a new FDF layer document.  When you
+     * are done with it you must call close() on it to release resources.
+     *
+     * @return A new FDFDocument created from the parsed COS document.
+     * @throws IOException If there is an error getting the document.
+     */
+    public FDFDocument getFDFDocument() throws IOException
+    {
+        COSDocument parsed = getDocument();
+        return new FDFDocument( parsed );
+    }
+
+    /**
+     * This will parse the next top level item from the stream: an indirect
+     * object, or the xref/trailer/startxref section that ends a file body.
+     *
+     * @return The pooled COSObject, the first PDFXref of a section, or null.
+     * @throws IOException If an IO error occurs or the syntax is not as expected.
+     */
+    private Object parseObject() throws IOException
+    {
+        Object object = null;
+        char peekedChar = (char)pdfSource.peek();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" );
+        }
+        if( pdfSource.isEOF() )
+        {
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Skipping because of EOF" );
+                //end of file we will return a null object and call it a day.
+            }
+        }
+        else if( peekedChar == 'x' ||
+                 peekedChar == 't' ||
+                 peekedChar == 's')
+        {
+            //the peeked character starts one of the keywords xref, trailer or startxref
+
+            //FDF documents do not always have the xref
+            if( peekedChar == 'x' || peekedChar == 't' )
+            {
+                object = parseXrefSection();
+            }
+
+            //if peeked char is xref or startxref
+            if( peekedChar == 'x' || peekedChar == 's')
+            {
+                skipSpaces();
+                while( pdfSource.peek() == 'x' )
+                {
+                    parseXrefSection();
+                }
+                String startxref = readString();
+                if( !startxref.equals( "startxref" ) )
+                {
+                    throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
+                }
+                skipSpaces();
+                //read some integer that is in the stream but PDFBox doesn't use
+                readInt();
+            }
+
+            //readString must not be used here because it strips out comments
+            //and it would think that %% is a comment in front of the EOF
+            String eof = readExpectedString( "%%EOF" );
+            if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
+            {
+                throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
+                                       " next=" +readString() );
+            }
+            else if( !pdfSource.isEOF() )
+            {
+                //we might really be at the end of the file, there might just be some crap at the
+                //end of the file.
+                if( pdfSource.available() < 1000 )
+                {
+                    //We need to determine if we are at the end of the file.
+                    byte[] data = new byte[ 1000 ];
+
+                    int amountRead = pdfSource.read( data );
+                    if( amountRead != -1 )
+                    {
+                        pdfSource.unread( data, 0, amountRead );
+                    }
+                    boolean atEndOfFile = true;//we assume yes unless we find another.
+                    //a three byte match may start as late as index amountRead-3
+                    for( int i=0; i<amountRead-2 && atEndOfFile; i++ )
+                    {
+                        atEndOfFile = !(data[i] == 'E' &&
+                                        data[i+1] == 'O' && data[i+2] == 'F' );
+                    }
+                    if( atEndOfFile )
+                    {
+                        while( pdfSource.read( data, 0, data.length ) != -1 )
+                        {
+                            //read until done.
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            int number;
+            int genNum;
+            String objectKey = null;
+            try
+            {
+                number = readInt();
+            }
+            catch( IOException e )
+            {
+                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
+                //statements after an object, of course this is nonsense
+                //but because we want to support as many PDFs as possible
+                //we will simply try again
+                number = readInt();
+            }
+            skipSpaces();
+            genNum = readInt();
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "Parsing object (" + number + "," + genNum + ")" );
+            }
+
+            objectKey = readString( 3 );
+            //the keyword 'obj' has to follow the object and generation number
+            if( !objectKey.equals( "obj" ) )
+            {
+                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
+            }
+
+            skipSpaces();
+            COSBase pb = parseDirObject();
+            String endObjectKey = readString();
+            if( endObjectKey.equals( "stream" ) )
+            {
+                pdfSource.unread( endObjectKey.getBytes() );
+                pdfSource.unread( ' ' );
+                if( pb instanceof COSDictionary )
+                {
+                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
+                }
+                else
+                {
+                    // this is not legal
+                    // the combination of a dict and the stream/endstream forms a complete stream object
+                    throw new IOException("stream not preceded by dictionary");
+                }
+                endObjectKey = readString();
+            }
+            COSObjectKey key = new COSObjectKey( number, genNum );
+            COSObject pdfObject = document.getObjectFromPool( key );
+            object = pdfObject;
+            pdfObject.setObject(pb);
+
+            if( !endObjectKey.equals( "endobj" ) )
+            {
+                if( !pdfSource.isEOF() )
+                {
+                    try
+                    {
+                        //It is possible that the endobj  is missing, there
+                        //are several PDFs out there that do that so skip it and move on.
+                        Float.parseFloat( endObjectKey );
+                        pdfSource.unread( SPACE_BYTE );
+                        pdfSource.unread( endObjectKey.getBytes() );
+                        if( log.isDebugEnabled() )
+                        {
+                            log.debug( "Missing endobj, found '" + endObjectKey +
+                                "' instead, assuming that endobj is not present and will continue parsing." );
+                        }
+                    }
+                    catch( NumberFormatException e )
+                    {
+                        //we will try again incase there was some garbage which
+                        //some writers will leave behind.
+                        String secondEndObjectKey = readString();
+                        if( !secondEndObjectKey.equals( "endobj" ) )
+                        {
+                            throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
+                                "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
+                        }
+                    }
+                }
+            }
+            skipSpaces();
+
+        }
+        //object is still null if the end of the file or only startxref was found
+        return object;
+    }
+
+
+    /**
+     * This will parse one xref table together with the trailer that follows it.
+     *
+     * @return a new PDFXref holding the start and count stored by parseXrefTable()
+     *
+     * @throws IOException If an IO error occurs.
+     */
+    protected PDFXref parseXrefSection() throws IOException
+    {
+        int[] startAndCount = { 0, 0 };
+        parseXrefTable( startAndCount );
+        parseTrailer();
+        //element 0 is the start attribute, element 1 is the count attribute
+        return new PDFXref( startAndCount[0], startAndCount[1] );
+    }
+
+    /**
+     * This will read past an xref table in the stream, up to the trailer.
+     *
+     * Only the two numbers after the xref keyword are kept: the starting
+     * object number goes to params[0] and the count to params[1].
+     *
+     * @param params Receives the start and count; needs at least two elements.
+     * @throws IOException If an IO error occurs.
+     */
+    protected void parseXrefTable(int[] params) throws IOException
+    {
+        String token = readLine();
+        if( token.equals( "xref" ) )
+        {
+            //the two integers after the keyword are the start and the count
+            params[0] = readInt();
+            params[1] = readInt();
+            token = readString();
+        }
+        skipSpaces();
+        //skip past all the xref entries: stop at the trailer keyword, at the
+        //end of the stream or when isEndOfName() accepts the next character
+        while( !( token.equals( "trailer" ) || pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()) ) )
+        {
+            token = readString();
+            skipSpaces();
+        }
+        skipSpaces();
+    }
+
+    /** This will parse a trailer dictionary and merge it into the document trailer. */
+    private void parseTrailer() throws IOException
+    {
+        COSDictionary fresh = parseCOSDictionary();
+        COSDictionary existing = document.getTrailer();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parsedTrailer=" + fresh );
+            log.debug( "docTrailer=" + existing );
+        }
+        if( existing == null )
+        {
+            //no trailer stored so far: the parsed one becomes the document trailer
+            document.setTrailer( fresh );
+            return;
+        }
+        existing.addAll( fresh );
+    }
+}
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
new file mode 100644
index 0000000..d59c5a4
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
@@ -0,0 +1,403 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSBoolean;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSName;
+import org.pdfbox.cos.COSNull;
+import org.pdfbox.cos.COSNumber;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+
+import org.pdfbox.util.PDFOperator;
+import org.pdfbox.util.ImageParameters;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This will parse a PDF byte stream and extract operands and such.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.29 $
+ */
+public class PDFStreamParser extends BaseParser
+{
+    private static final Logger log = Logger.getLogger(PDFStreamParser.class);
+    private List streamObjects = new ArrayList( 100 );
+    private RandomAccessFile file;
+    private PDFOperator lastBIToken = null;
+
+    /**
+     * Constructor that takes a stream to parse.
+     *
+     * @param stream The stream to read data from.
+     * @param raf The random access file that is handed to parseCOSStream().
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
+    {
+        super( stream );
+        file = raf;
+    }
+
+    /**
+     * Constructor that reads the unfiltered data of a COS stream.
+     *
+     * @param stream The stream to parse; its scratch file is used as well.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFStreamParser( COSStream stream ) throws IOException
+    {
+       this( stream.getUnfilteredStream(), stream.getScratchFile() );
+    }
+
+    /**
+     * This will parse the tokens in the stream.  This will close the
+     * stream when it is finished parsing, even if parsing fails.
+     *
+     * @throws IOException If there is an error while parsing the stream.
+     */
+    public void parse() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() start" );
+        }
+
+        try
+        {
+            Object token = null;
+            while( (token = parseNextToken()) != null )
+            {
+                streamObjects.add( token );
+            }
+        }
+        finally
+        {
+            pdfSource.close();
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() end" );
+        }
+    }
+
+    /**
+     * This will get the tokens that were parsed from the stream.
+     *
+     * @return The live list of all tokens that parse() has collected so far.
+     */
+    public List getTokens()
+    {
+        return streamObjects;
+    }
+
+    /**
+     * This will parse the next token in the stream: a COS object, a PDFOperator or
+     * null.  A BI operator is returned with its image parameters and data attached.
+     *
+     * @return The next token in the stream or null if there are no more tokens in the stream.
+     * @throws IOException If an io error occurs or the stream is malformed.
+     */
+    private Object parseNextToken() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parseNextToken() start" );
+        }
+        Object retval = null;
+
+        skipSpaces();
+        int nextByte = pdfSource.peek();
+        //NOTE(review): the byte cast also maps a 0xFF data byte to -1, so such a
+        //byte ends the token stream just like a real end of stream - confirm intent
+        if( ((byte)nextByte) == -1 )
+        {
+            return null;
+        }
+        char c = (char)nextByte;
+        switch(c)
+        {
+            case '<':
+            {
+                int leftBracket = pdfSource.read();//pull off first left bracket
+                c = (char)pdfSource.peek(); //check for second left bracket
+                pdfSource.unread( leftBracket ); //put back first bracket
+                if(c == '<')
+                {
+
+                    COSDictionary pod = parseCOSDictionary();
+                    skipSpaces();
+                    if((char)pdfSource.peek() == 's')
+                    {
+                        retval = parseCOSStream( pod, file );
+                    }
+                    else
+                    {
+                        retval = pod;
+                    }
+                }
+                else
+                {
+                    retval = parseCOSString();
+                }
+                break;
+            }
+            case '[': // array
+            {
+                retval = parseCOSArray();
+                break;
+            }
+            case '(': // string
+                retval = parseCOSString();
+                break;
+            case '/':   // name
+                retval = parseCOSName();
+                break;
+            case 'n':   // null
+            {
+                String nullString = readString();
+                if( nullString.equals( "null") )
+                {
+                    retval = COSNull.NULL;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( nullString );
+                }
+                break;
+            }
+            case 't':
+            case 'f':
+            {
+                String next = readString();
+                if( next.equals( "true" ) )
+                {
+                    retval = COSBoolean.TRUE;
+                }
+                else if( next.equals( "false" ) )
+                {
+                    retval = COSBoolean.FALSE;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( next );
+                }
+                break;
+            }
+            case 'R':
+            {
+                String line = readString();
+                if( line.equals( "R" ) )
+                {
+                    retval = new COSObject( null );
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( line );
+                }
+                break;
+            }
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            case '-':
+            case '+':
+            case '.':
+            {
+                //the case labels already guarantee that c starts a number, so
+                //collect every following digit, sign and dot character
+                StringBuffer buf = new StringBuffer();
+                while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
+                {
+                    buf.append( c );
+                    pdfSource.read();
+                }
+                retval = COSNumber.get( buf.toString() );
+                break;
+            }
+            case 'B':
+            {
+                String next = readString();
+                retval = PDFOperator.getOperator( next );
+
+                if( next.equals( "BI" ) )
+                {
+                    lastBIToken = (PDFOperator)retval;
+                    COSDictionary imageParams = new COSDictionary();
+                    lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
+                    Object nextToken = null;
+                    while( (nextToken = parseNextToken()) instanceof COSName )
+                    {
+                        Object value = parseNextToken();
+                        if( !(value instanceof COSBase) )
+                        {
+                            //truncated or corrupt parameter list, fail with a checked exception
+                            throw new IOException( "Error: Expected inline image parameter value for " +
+                                nextToken + " actual='" + value + "'" );
+                        }
+                        imageParams.setItem( (COSName)nextToken, (COSBase)value );
+                    }
+                    //the token that ended the parameter list has to be the
+                    //operator that carries the image data
+                    if( !(nextToken instanceof PDFOperator) )
+                    {
+                        throw new IOException( "Error: Expected operator 'ID' after 'BI' actual='" +
+                            nextToken + "'" );
+                    }
+                    PDFOperator imageData = (PDFOperator)nextToken;
+                    lastBIToken.setImageData( imageData.getImageData() );
+                }
+                break;
+            }
+            case 'I':
+            {
+                //Special case for ID operator
+                String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
+                if( !id.equals( "ID" ) )
+                {
+                    throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
+                }
+                ByteArrayOutputStream imageData = new ByteArrayOutputStream();
+                if( this.isWhitespace() )
+                {
+                    //pull off the whitespace character
+                    pdfSource.read();
+                }
+                int twoBytesAgo = 0;
+                int lastByte = pdfSource.read();
+                int currentByte = pdfSource.read();
+                //PDF spec is kinda unclear about this.  Should a whitespace
+                //always appear before EI? Not sure, I found a PDF
+                //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
+                //of the image data and will stop parsing prematurely if there is
+                //not a check for <whitespace>EI<whitespace>.
+                //An expected byte count is deliberately not part of the end
+                //condition: amyuni2_05d__pdf1_3_acro4x.pdf has image data that
+                //is compressed, so a size derived from the image parameters
+                //is useless here.
+                while( !(isWhitespace( twoBytesAgo ) &&
+                         lastByte == 'E' &&
+                         currentByte == 'I' &&
+                         isWhitespace() ) &&
+                       !pdfSource.isEOF() )
+                {
+                    imageData.write( lastByte );
+                    twoBytesAgo = lastByte;
+                    lastByte = currentByte;
+                    currentByte = pdfSource.read();
+                }
+                pdfSource.unread( 'I' ); //unread the EI operator
+                pdfSource.unread( 'E' );
+                retval = PDFOperator.getOperator( "ID" );
+                ((PDFOperator)retval).setImageData( imageData.toByteArray() );
+                break;
+            }
+            case ']':
+            {
+                // some ']' around without its previous '['
+                // this means a PDF is somewhat corrupt but we will continue to parse.
+                pdfSource.read();
+                retval = COSNull.NULL;  // must be a better solution than null...
+                break;
+            }
+            default:
+            {
+                //we must be an operator
+                String operator = readOperator();
+                if( operator.trim().length() == 0 )
+                {
+                    //we have a corrupt stream, stop reading here
+                    retval = null;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( operator );
+                }
+            }
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
+        }
+
+        return retval;
+    }
+
+    /**
+     * This will read an operator from the stream.  Reading stops at whitespace, a
+     * closing character, the end of the stream, a digit or one of [ &lt; ( /
+     *
+     * @return The operator that was read from the stream, possibly empty.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected String readOperator() throws IOException
+    {
+        skipSpaces();
+
+        //average string size is around 2 and the normal string buffer size is
+        //about 16 so lets save some space.
+        StringBuffer buffer = new StringBuffer(4);
+        while(
+            !isWhitespace() &&
+            !isClosing() &&
+            !pdfSource.isEOF() &&
+            pdfSource.peek() != (int)'[' &&
+            pdfSource.peek() != (int)'<' &&
+            pdfSource.peek() != (int)'(' &&
+            pdfSource.peek() != (int)'/' &&
+            (pdfSource.peek() < (int)'0' ||
+             pdfSource.peek() > (int)'9' ) )
+        {
+            buffer.append( (char)pdfSource.read() );
+        }
+        return buffer.toString();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFXref.java b/src/main/java/org/pdfbox/pdfparser/PDFXref.java
new file mode 100644
index 0000000..abe0f35
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFXref.java
@@ -0,0 +1,96 @@
+/**
+ * Copyright (c) 2003, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+/**
+ * This class represents a PDF xref.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.3 $
+ */
+public class PDFXref
+{
+
+    private long start;
+    private long count;
+
+    /**
+     * Creates an xref from its two attributes.
+     *
+     * @param startValue The value for the start attribute.
+     * @param countValue The value for the count attribute.
+     */
+    public PDFXref( long startValue, long countValue )
+    {
+        this.setStart( startValue );
+        this.setCount( countValue );
+    }
+
+    /**
+     * This will return the start attribute.
+     *
+     * @return The start that was given to the constructor.
+     */
+    public long getStart()
+    {
+        return this.start;
+    }
+
+    /**
+     * This will return the count attribute.
+     *
+     * @return The count that was given to the constructor.
+     */
+    public long getCount()
+    {
+        return this.count;
+    }
+
+    /**
+     * This will store the start attribute.
+     *
+     * @param value The new start attribute.
+     */
+    private void setStart(long value)
+    {
+        this.start = value;
+    }
+
+    /**
+     * This will store the count attribute.
+     *
+     * @param value The new count attribute.
+     */
+    private void setCount(long value)
+    {
+        this.count = value;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/package.html b/src/main/java/org/pdfbox/pdfparser/package.html
new file mode 100644
index 0000000..fe012c1
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/package.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+
+</head>
+<body>
+The pdfparser package contains classes to parse PDF documents and objects within the document.
+</body>
+</html>
-- 
cgit v1.2.3