aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
diff options
context:
space:
mode:
author tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2006-12-01 12:20:24 +0000
committer tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2006-12-01 12:20:24 +0000
commit 6025b6016517c6d898d8957d1d7e03ba71431912 (patch)
tree b15bd6fa5ffe9588a9bca3f2b8a7e358f83b6eba /src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
parent d2c77e820ab4aba8235d71275755021347b3ad10 (diff)
download pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.gz
pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.tar.bz2
pdf-as-3-6025b6016517c6d898d8957d1d7e03ba71431912.zip
Initial import of release 2.2.REL-2.2@923
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java')
-rw-r--r-- src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java 403
1 files changed, 403 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
new file mode 100644
index 0000000..d59c5a4
--- /dev/null
+++ b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
@@ -0,0 +1,403 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.pdfparser;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.pdfbox.cos.COSBase;
+import org.pdfbox.cos.COSBoolean;
+import org.pdfbox.cos.COSDictionary;
+import org.pdfbox.cos.COSName;
+import org.pdfbox.cos.COSNull;
+import org.pdfbox.cos.COSNumber;
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+
+import org.pdfbox.util.PDFOperator;
+import org.pdfbox.util.ImageParameters;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This will parse a PDF byte stream and extract operands and such.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.29 $
+ */
+public class PDFStreamParser extends BaseParser
+{
+ // log4j logger for this class; the visible code uses it for debug tracing only.
+ private static Logger log = Logger.getLogger(PDFStreamParser.class);
+ // Tokens collected by parse(), in the order they were read; returned by getTokens().
+ private List streamObjects = new ArrayList( 100 );
+ // Random access file given to the constructor; passed on to parseCOSStream().
+ private RandomAccessFile file;
+ // The most recently parsed "BI" (begin inline image) operator, or null if none was seen yet.
+ private PDFOperator lastBIToken = null;
+
+ /**
+  * Creates a parser that reads tokens from the given input stream.
+  *
+  * @param stream The stream to read data from; handed to the base parser.
+  * @param raf The random access file, kept for use when an embedded
+  *            stream object has to be parsed.
+  *
+  * @throws IOException If there is an error reading from the stream.
+  */
+ public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
+ {
+     super( stream );
+     this.file = raf;
+ }
+
+ /**
+ * Constructor that parses the content of a COSStream.
+ *
+ * Delegates to the (InputStream, RandomAccessFile) constructor, using the
+ * stream's unfiltered stream as input and its scratch file as the random
+ * access file.
+ *
+ * @param stream The stream to parse.
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFStreamParser( COSStream stream ) throws IOException
+ {
+ this( stream.getUnfilteredStream(), stream.getScratchFile() );
+ }
+
+ /**
+  * Tokenizes the whole stream and appends every token to the list that
+  * {@link #getTokens()} returns. The underlying source is closed when
+  * parsing ends, whether it completed normally or with an exception.
+  *
+  * @throws IOException If there is an error while parsing the stream.
+  */
+ public void parse() throws IOException
+ {
+     if( log.isDebugEnabled() )
+     {
+         log.debug( "parse() start" );
+     }
+
+     try
+     {
+         // parseNextToken() returns null once the stream is exhausted
+         for( Object token = parseNextToken(); token != null; token = parseNextToken() )
+         {
+             streamObjects.add( token );
+         }
+     }
+     finally
+     {
+         pdfSource.close();
+     }
+
+     if( log.isDebugEnabled() )
+     {
+         log.debug( "parse() end" );
+     }
+ }
+
+ /**
+  * Returns the tokens collected so far by {@link #parse()}.
+  *
+  * The returned list is the parser's own internal list, not a copy.
+  *
+  * @return All of the tokens in the stream.
+  */
+ public List getTokens()
+ {
+     return this.streamObjects;
+ }
+
+ /**
+  * This will parse the next token in the stream.
+  *
+  * The first non-space byte decides how the token is read: dictionaries,
+  * embedded streams, strings, arrays, names, null, booleans, numbers and
+  * operators are recognized. The inline image operators get special
+  * treatment: "BI" collects the following name/value pairs as image
+  * parameters and "ID" collects the raw image bytes up to the "EI" operator.
+  *
+  * @return The next token in the stream or null if there are no more tokens in the stream.
+  *
+  * @throws IOException If an io error occurs while parsing the stream or
+  *         if an inline image is malformed.
+  */
+ private Object parseNextToken() throws IOException
+ {
+     if( log.isDebugEnabled() )
+     {
+         log.debug( "parseNextToken() start" );
+     }
+     Object retval = null;
+
+     skipSpaces();
+     int nextByte = pdfSource.peek();
+     // NOTE(review): because of the byte cast a raw 0xFF byte is treated like
+     // end of stream as well; kept as is, since changing it would alter the
+     // tokenization of existing documents - confirm whether this is intended.
+     if( ((byte)nextByte) == -1 )
+     {
+         return null;
+     }
+     char c = (char)nextByte;
+     switch(c)
+     {
+         case '<':
+         {
+             int leftBracket = pdfSource.read();//pull off first left bracket
+             c = (char)pdfSource.peek(); //check for second left bracket
+             pdfSource.unread( leftBracket ); //put back first bracket
+             if(c == '<')
+             {
+                 COSDictionary pod = parseCOSDictionary();
+                 skipSpaces();
+                 // an 's' after the dictionary is taken as the start of a stream
+                 if((char)pdfSource.peek() == 's')
+                 {
+                     retval = parseCOSStream( pod, file );
+                 }
+                 else
+                 {
+                     retval = pod;
+                 }
+             }
+             else
+             {
+                 retval = parseCOSString();
+             }
+             break;
+         }
+         case '[': // array
+         {
+             retval = parseCOSArray();
+             break;
+         }
+         case '(': // string
+             retval = parseCOSString();
+             break;
+         case '/': // name
+             retval = parseCOSName();
+             break;
+         case 'n': // null
+         {
+             String nullString = readString();
+             if( nullString.equals( "null") )
+             {
+                 retval = COSNull.NULL;
+             }
+             else
+             {
+                 retval = PDFOperator.getOperator( nullString );
+             }
+             break;
+         }
+         case 't':
+         case 'f':
+         {
+             String next = readString();
+             if( next.equals( "true" ) )
+             {
+                 retval = COSBoolean.TRUE;
+             }
+             else if( next.equals( "false" ) )
+             {
+                 retval = COSBoolean.FALSE;
+             }
+             else
+             {
+                 retval = PDFOperator.getOperator( next );
+             }
+             break;
+         }
+         case 'R':
+         {
+             String line = readString();
+             if( line.equals( "R" ) )
+             {
+                 retval = new COSObject( null );
+             }
+             else
+             {
+                 retval = PDFOperator.getOperator( line );
+             }
+             break;
+         }
+         case '0':
+         case '1':
+         case '2':
+         case '3':
+         case '4':
+         case '5':
+         case '6':
+         case '7':
+         case '8':
+         case '9':
+         case '-':
+         case '+':
+         case '.':
+         {
+             retval = parseNumberToken();
+             break;
+         }
+         case 'B':
+         {
+             String next = readString();
+             retval = PDFOperator.getOperator( next );
+
+             if( next.equals( "BI" ) )
+             {
+                 parseInlineImage( (PDFOperator)retval );
+             }
+             break;
+         }
+         case 'I':
+         {
+             retval = parseInlineImageData();
+             break;
+         }
+         case ']':
+         {
+             // some ']' around without its previous '['
+             // this means a PDF is somewhat corrupt but we will continue to parse.
+             pdfSource.read();
+             retval = COSNull.NULL; // must be a better solution than null...
+             break;
+         }
+         default:
+         {
+             //we must be an operator
+             String operator = readOperator();
+             if( operator.trim().length() == 0 )
+             {
+                 //we have a corrupt stream, stop reading here
+                 retval = null;
+             }
+             else
+             {
+                 retval = PDFOperator.getOperator( operator );
+             }
+             break;
+         }
+     }
+     if( log.isDebugEnabled() )
+     {
+         log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
+     }
+
+     return retval;
+ }
+
+ /**
+  * Reads a run of digits, signs and decimal points and converts it to a number.
+  *
+  * @return The result of COSNumber.get() for the characters that were read.
+  *
+  * @throws IOException If there is an error reading from the stream.
+  */
+ private Object parseNumberToken() throws IOException
+ {
+     StringBuffer buf = new StringBuffer();
+     char c = (char)pdfSource.peek();
+     while( Character.isDigit( c ) || c == '-' || c == '+' || c == '.' )
+     {
+         buf.append( c );
+         pdfSource.read();
+         c = (char)pdfSource.peek();
+     }
+     return COSNumber.get( buf.toString() );
+ }
+
+ /**
+  * Completes a "BI" operator: reads the name/value pairs that follow it into
+  * the operator's image parameters and attaches the data of the image data
+  * operator that ends the parameter list.
+  *
+  * @param beginImage The "BI" operator that was just read; it is remembered
+  *        as the last BI token.
+  *
+  * @throws IOException If the stream cannot be read, if a parameter value is
+  *         not a COS object or if the parameters are not followed by an operator.
+  */
+ private void parseInlineImage( PDFOperator beginImage ) throws IOException
+ {
+     lastBIToken = beginImage;
+     COSDictionary imageParams = new COSDictionary();
+     lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
+     Object nextToken = null;
+     while( (nextToken = parseNextToken()) instanceof COSName )
+     {
+         Object value = parseNextToken();
+         // a null value (end of stream) is passed through as before, anything
+         // else must be a COS object or the stream is corrupt
+         if( value != null && !(value instanceof COSBase) )
+         {
+             throw new IOException( "Error: Expected a value for inline image parameter '" +
+                 nextToken + "' actual='" + value + "'" );
+         }
+         imageParams.setItem( (COSName)nextToken, (COSBase)value );
+     }
+     //final token will be the image data, maybe??
+     if( !(nextToken instanceof PDFOperator) )
+     {
+         throw new IOException( "Error: Expected operator 'ID' after the inline image parameters actual='" +
+             nextToken + "'" );
+     }
+     PDFOperator imageData = (PDFOperator)nextToken;
+     lastBIToken.setImageData( imageData.getImageData() );
+ }
+
+ /**
+  * Reads the "ID" operator together with the raw inline image bytes that
+  * follow it. The "EI" operator that ends the data is pushed back so that it
+  * is returned as the next token.
+  *
+  * @return The "ID" operator with the image data set on it.
+  *
+  * @throws IOException If the next two bytes are not "ID" or if there is an
+  *         error reading from the stream.
+  */
+ private Object parseInlineImageData() throws IOException
+ {
+     //Special case for ID operator
+     String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
+     if( !id.equals( "ID" ) )
+     {
+         throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
+     }
+     ByteArrayOutputStream imageData = new ByteArrayOutputStream();
+     if( this.isWhitespace() )
+     {
+         //pull off the whitespace character
+         pdfSource.read();
+     }
+     int twoBytesAgo = 0;
+     int lastByte = pdfSource.read();
+     int currentByte = pdfSource.read();
+     //PDF spec is kinda unclear about this. Should a whitespace
+     //always appear before EI? Not sure, I found a PDF
+     //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
+     //of the image data and will stop parsing prematurely if there is
+     //not a check for <whitespace>EI<whitespace>.
+     //The expected number of bytes is deliberately not used as a stop
+     //condition: amyuni2_05d__pdf1_3_acro4x.pdf has image data that
+     //is compressed, so the size derived from the image parameters is
+     //useless here.
+     while( !(isWhitespace( twoBytesAgo ) &&
+              lastByte == 'E' &&
+              currentByte == 'I' &&
+              isWhitespace() ) &&
+            !pdfSource.isEOF() )
+     {
+         imageData.write( lastByte );
+         twoBytesAgo = lastByte;
+         lastByte = currentByte;
+         currentByte = pdfSource.read();
+     }
+     pdfSource.unread( 'I' ); //unread the EI operator
+     pdfSource.unread( 'E' );
+     PDFOperator idOperator = (PDFOperator)PDFOperator.getOperator( "ID" );
+     idOperator.setImageData( imageData.toByteArray() );
+     return idOperator;
+ }
+
+ /**
+  * This will read an operator from the stream.
+  *
+  * Leading spaces are skipped. Reading stops at whitespace, at a closing
+  * character, at the end of the stream, or in front of a character that
+  * starts an operand ('[', '<', '(', '/' or a digit). The one exception to
+  * the digit rule is a '0' or '1' that directly follows a leading 'd', so
+  * that the glyph description operators "d0" and "d1" are read as a single
+  * operator instead of as "d" followed by a number.
+  *
+  * @return The operator that was read from the stream; empty if no operator
+  *         character was found.
+  *
+  * @throws IOException If there is an error reading from the stream.
+  */
+ protected String readOperator() throws IOException
+ {
+     skipSpaces();
+
+     //average string size is around 2 and the normal string buffer size is
+     //about 16 so lets save some space.
+     StringBuffer buffer = new StringBuffer(4);
+     while( !isWhitespace() &&
+            !isClosing() &&
+            !pdfSource.isEOF() )
+     {
+         int next = pdfSource.peek();
+         if( next == (int)'[' ||
+             next == (int)'<' ||
+             next == (int)'(' ||
+             next == (int)'/' ||
+             (next >= (int)'0' && next <= (int)'9') )
+         {
+             //start of the next operand, the operator ends here
+             break;
+         }
+         char current = (char)pdfSource.read();
+         buffer.append( current );
+         //"d0" and "d1" are operators with a digit in their name
+         if( buffer.length() == 1 && current == 'd' )
+         {
+             int digit = pdfSource.peek();
+             if( digit == (int)'0' || digit == (int)'1' )
+             {
+                 buffer.append( (char)pdfSource.read() );
+             }
+         }
+     }
+     return buffer.toString();
+ }
+} \ No newline at end of file