Initial import of release 2.2.REL-2.2@923

git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
author: tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2006-12-01 12:20:24 +0000
committer: tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2006-12-01 12:20:24 +0000
commit: 6025b6016517c6d898d8957d1d7e03ba71431912 (patch)
tree: b15bd6fa5ffe9588a9bca3f2b8a7e358f83b6eba /src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
parent: d2c77e820ab4aba8235d71275755021347b3ad10 (diff)
download: pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.gz
pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.bz2
pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.zip
1 files changed, 403 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
new file mode 100644
index 0000000..d59c5a4
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
@@ -0,0 +1,403 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSBoolean;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSName;
+import org.pdfbox.cos.COSNull;
+import org.pdfbox.cos.COSNumber;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+
+import org.pdfbox.util.PDFOperator;
+import org.pdfbox.util.ImageParameters;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This will parse a PDF byte stream into a list of tokens: operands (COS objects) and operators.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.29 $
+ */
+public class PDFStreamParser extends BaseParser
+{
+    /** Logger used for the debug tracing in parse() and parseNextToken(). */
+    private static Logger log = Logger.getLogger(PDFStreamParser.class);
+    /** Tokens collected by parse(), in the order they were read. */
+    private List streamObjects = new ArrayList( 100 );
+    /** File handed to parseCOSStream() when a dictionary is followed by an 's'. */
+    private RandomAccessFile file;
+    /** Most recent BI operator seen; it receives the inline image parameters and data. */
+    private PDFOperator lastBIToken = null;
+
+    /**
+     * Creates a parser that reads its tokens from the given stream.
+     *
+     * @param stream The stream to read data from.
+     * @param raf The random access file that is passed along when an embedded
+     *            stream object is parsed.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
+    {
+        super( stream );
+        this.file = raf;
+    }
+
+    /**
+     * Constructor.  Parses the unfiltered data of the given COS stream and
+     * uses the stream's scratch file as the random access file.
+     *
+     * @param stream The stream to parse.
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFStreamParser( COSStream stream ) throws IOException
+    {
+       this( stream.getUnfilteredStream(), stream.getScratchFile() );
+    }
+
+    /**
+     * Reads tokens until the stream is exhausted and stores them so that they
+     * can be retrieved with getTokens().  The underlying stream is closed when
+     * this method returns, whether or not parsing succeeded.
+     *
+     * @throws IOException If there is an error while parsing the stream.
+     */
+    public void parse() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() start" );
+        }
+
+        try
+        {
+            //a null token signals that there is nothing left to read
+            for( Object next = parseNextToken(); next != null; next = parseNextToken() )
+            {
+                streamObjects.add( next );
+            }
+        }
+        finally
+        {
+            pdfSource.close();
+        }
+
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parse() end" );
+        }
+    }
+
+    /**
+     * This will get the tokens that were collected by parse().  The parser's
+     * own list is returned, not a copy.
+     *
+     * @return All of the tokens that have been parsed from the stream so far.
+     */
+    public List getTokens()
+    {
+        return this.streamObjects;
+    }
+
+    /**
+     * This will parse the next token in the stream.
+     *
+     * @return The next token in the stream or null if there are no more tokens in the stream.
+     *
+     * @throws IOException If an io error occurs while parsing the stream.
+     */
+    private Object parseNextToken() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parseNextToken() start" );
+        }
+        Object retval = null;
+
+        skipSpaces();
+        int nextByte = pdfSource.peek();
+        if( ((byte)nextByte) == -1 )
+        {
+            return null;
+        }
+        char c = (char)nextByte;
+        switch(c)
+        {
+            case '<':
+            {
+                int leftBracket = pdfSource.read();//pull off first left bracket
+                c = (char)pdfSource.peek(); //check for second left bracket
+                pdfSource.unread( leftBracket ); //put back first bracket
+                if(c == '<')
+                {
+
+                    COSDictionary pod = parseCOSDictionary();
+                    skipSpaces();
+                    if((char)pdfSource.peek() == 's')
+                    {
+                        retval = parseCOSStream( pod, file );
+                    }
+                    else
+                    {
+                        retval = pod;
+                    }
+                }
+                else
+                {
+                    retval = parseCOSString();
+                }
+                break;
+            }
+            case '[': // array
+            {
+                retval = parseCOSArray();
+                break;
+            }
+            case '(': // string
+                retval = parseCOSString();
+                break;
+            case '/':   // name
+                retval = parseCOSName();
+                break;
+            case 'n':   // null
+            {
+                String nullString = readString();
+                if( nullString.equals( "null") )
+                {
+                    retval = COSNull.NULL;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( nullString );
+                }
+                break;
+            }
+            case 't':
+            case 'f':
+            {
+                String next = readString();
+                if( next.equals( "true" ) )
+                {
+                    retval = COSBoolean.TRUE;
+                    break;
+                }
+                else if( next.equals( "false" ) )
+                {
+                    retval = COSBoolean.FALSE;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( next );
+                }
+                break;
+            }
+            case 'R':
+            {
+                String line = readString();
+                if( line.equals( "R" ) )
+                {
+                    retval = new COSObject( null );
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( line );
+                }
+                break;
+            }
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            case '-':
+            case '+':
+            case '.':
+            {
+                if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
+                {
+                    StringBuffer buf = new StringBuffer();
+                    while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
+                    {
+                        buf.append( c );
+                        pdfSource.read();
+                    }
+                    retval = COSNumber.get( buf.toString() );
+                }
+                else
+                {
+                    throw new IOException( "Unknown dir object c='" + c +
+                        "' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
+                }
+                break;
+            }
+            case 'B':
+            {
+                String next = readString();
+                retval = PDFOperator.getOperator( next );
+
+                if( next.equals( "BI" ) )
+                {
+                    lastBIToken = (PDFOperator)retval;
+                    COSDictionary imageParams = new COSDictionary();
+                    lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
+                    Object nextToken = null;
+                    while( (nextToken = parseNextToken()) instanceof COSName )
+                    {
+                        Object value = parseNextToken();
+                        imageParams.setItem( (COSName)nextToken, (COSBase)value );
+                    }
+                    //final token will be the image data, maybe??
+                    PDFOperator imageData = (PDFOperator)nextToken;
+                    lastBIToken.setImageData( imageData.getImageData() );
+                }
+                break;
+            }
+            case 'I':
+            {
+                ImageParameters imageParams = lastBIToken.getImageParameters();
+                int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() *
+                                    (imageParams.getBitsPerComponent()/8) );
+                //Special case for ID operator
+                String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
+                if( !id.equals( "ID" ) )
+                {
+                    throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
+                }
+                ByteArrayOutputStream imageData = new ByteArrayOutputStream();
+                boolean foundEnd = false;
+                if( this.isWhitespace() )
+                {
+                    //pull off the whitespace character
+                    pdfSource.read();
+                }
+                int twoBytesAgo = 0;
+                int lastByte = pdfSource.read();
+                int currentByte = pdfSource.read();
+                int count = 0;
+                //PDF spec is kinda unclear about this.  Should a whitespace
+                //always appear before EI? Not sure, I found a PDF
+                //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
+                //of the image data and will stop parsing prematurely if there is
+                //not a check for <whitespace>EI<whitespace>.
+                while( !(isWhitespace( twoBytesAgo ) &&
+                         lastByte == 'E' &&
+                         currentByte == 'I' &&
+                         isWhitespace() //&&
+                         //amyuni2_05d__pdf1_3_acro4x.pdf has image data that
+                         //is compressed, so expectedBytes is useless here.
+                         //count >= expectedBytes
+                         ) &&
+                       !pdfSource.isEOF() )
+                {
+                    imageData.write( lastByte );
+                    twoBytesAgo = lastByte;
+                    lastByte = currentByte;
+                    currentByte = pdfSource.read();
+                    count++;
+                }
+                pdfSource.unread( 'I' ); //unread the EI operator
+                pdfSource.unread( 'E' );
+                retval = PDFOperator.getOperator( "ID" );
+                ((PDFOperator)retval).setImageData( imageData.toByteArray() );
+                break;
+            }
+            case ']':
+            {
+                // some ']' around without its previous '['
+                // this means a PDF is somewhat corrupt but we will continue to parse.
+                pdfSource.read();
+                retval = COSNull.NULL;  // must be a better solution than null...
+                break;
+            }
+            default:
+            {
+                //we must be an operator
+                String operator = readOperator();
+                if( operator.trim().length() == 0 )
+                {
+                    //we have a corrupt stream, stop reading here
+                    retval = null;
+                }
+                else
+                {
+                    retval = PDFOperator.getOperator( operator );
+                }
+            }
+
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
+        }
+
+        return retval;
+    }
+
+    /**
+     * This will read an operator from the stream.  Leading spaces are skipped
+     * and characters are collected until whitespace, a closing character, the
+     * end of the stream, a digit or one of '[', '&lt;', '(' and '/' is next.
+     *
+     * @return The operator that was read from the stream; empty if nothing could be read.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected String readOperator() throws IOException
+    {
+        skipSpaces();
+
+        //operators are usually around 2 characters long while the normal
+        //string buffer size is about 16, so start with a smaller buffer.
+        StringBuffer operator = new StringBuffer(4);
+        while( !isWhitespace() && !isClosing() && !pdfSource.isEOF() )
+        {
+            int upcoming = pdfSource.peek();
+            boolean startsOperand = upcoming == (int)'[' ||
+                                    upcoming == (int)'<' ||
+                                    upcoming == (int)'(' ||
+                                    upcoming == (int)'/';
+            boolean isDigit = upcoming >= (int)'0' && upcoming <= (int)'9';
+            if( startsOperand || isDigit )
+            {
+                //the next character belongs to the following token
+                break;
+            }
+            operator.append( (char)pdfSource.read() );
+        }
+        return operator.toString();
+    }
+}
+\ No newline at end of file
author	tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>	2006-12-01 12:20:24 +0000
committer	tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>	2006-12-01 12:20:24 +0000
commit	6025b6016517c6d898d8957d1d7e03ba71431912 (patch)
tree	b15bd6fa5ffe9588a9bca3f2b8a7e358f83b6eba /src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
parent	d2c77e820ab4aba8235d71275755021347b3ad10 (diff)
download	pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.gz pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.bz2 pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.zip