From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../java/org/pdfbox/pdfparser/PDFStreamParser.java | 403 +++++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java (limited to 'src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java') diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java new file mode 100644 index 0000000..d59c5a4 --- /dev/null +++ b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java @@ -0,0 +1,403 @@ +/** + * Copyright (c) 2003-2004, www.pdfbox.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of pdfbox; nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * http://www.pdfbox.org + * + */ +package org.pdfbox.pdfparser; + +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.RandomAccessFile; + +import java.util.ArrayList; +import java.util.List; + +import org.pdfbox.cos.COSBase; +import org.pdfbox.cos.COSBoolean; +import org.pdfbox.cos.COSDictionary; +import org.pdfbox.cos.COSName; +import org.pdfbox.cos.COSNull; +import org.pdfbox.cos.COSNumber; +import org.pdfbox.cos.COSObject; +import org.pdfbox.cos.COSStream; + +import org.pdfbox.util.PDFOperator; +import org.pdfbox.util.ImageParameters; + +import org.apache.log4j.Logger; + +/** + * This will parse a PDF byte stream and extract operands and such. + * + * @author Ben Litchfield (ben@csh.rit.edu) + * @version $Revision: 1.29 $ + */ +public class PDFStreamParser extends BaseParser +{ + private static Logger log = Logger.getLogger(PDFStreamParser.class); + private List streamObjects = new ArrayList( 100 ); + private RandomAccessFile file; + private PDFOperator lastBIToken = null; + + /** + * Constructor that takes a stream to parse. + * + * @param stream The stream to read data from. + * @param raf The random access file. + * + * @throws IOException If there is an error reading from the stream. + */ + public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException + { + super( stream ); + file = raf; + } + + /** + * Constructor. + * + * @param stream The stream to parse. + * + * @throws IOException If there is an error initializing the stream. + */ + public PDFStreamParser( COSStream stream ) throws IOException + { + this( stream.getUnfilteredStream(), stream.getScratchFile() ); + } + + /** + * This will parse the tokens in the stream. This will close the + * stream when it is finished parsing. + * + * @throws IOException If there is an error while parsing the stream. + */ + public void parse() throws IOException + { + if( log.isDebugEnabled() ) + { + log.debug( "parse() start" ); + } + + try + { + Object token = null; + while( (token = parseNextToken()) != null ) + { + streamObjects.add( token ); + } + } + finally + { + pdfSource.close(); + } + if( log.isDebugEnabled() ) + { + log.debug( "parse() end" ); + } + } + + /** + * This will get the tokens that were parsed from the stream. + * + * @return All of the tokens in the stream. + */ + public List getTokens() + { + return streamObjects; + } + + /** + * This will parse the next token in the stream. + * + * @return The next token in the stream or null if there are no more tokens in the stream. + * + * @throws IOException If an io error occurs while parsing the stream. + */ + private Object parseNextToken() throws IOException + { + if( log.isDebugEnabled() ) + { + log.debug( "parseNextToken() start" ); + } + Object retval = null; + + skipSpaces(); + int nextByte = pdfSource.peek(); + if( ((byte)nextByte) == -1 ) + { + return null; + } + char c = (char)nextByte; + switch(c) + { + case '<': + { + int leftBracket = pdfSource.read();//pull off first left bracket + c = (char)pdfSource.peek(); //check for second left bracket + pdfSource.unread( leftBracket ); //put back first bracket + if(c == '<') + { + + COSDictionary pod = parseCOSDictionary(); + skipSpaces(); + if((char)pdfSource.peek() == 's') + { + retval = parseCOSStream( pod, file ); + } + else + { + retval = pod; + } + } + else + { + retval = parseCOSString(); + } + break; + } + case '[': // array + { + retval = parseCOSArray(); + break; + } + case '(': // string + retval = parseCOSString(); + break; + case '/': // name + retval = parseCOSName(); + break; + case 'n': // null + { + String nullString = readString(); + if( nullString.equals( "null") ) + { + retval = COSNull.NULL; + } + else + { + retval = PDFOperator.getOperator( nullString ); + } + break; + } + case 't': + case 'f': + { + String next = readString(); + if( next.equals( "true" ) ) + { + retval = COSBoolean.TRUE; + break; + } + else if( next.equals( "false" ) ) + { + retval = COSBoolean.FALSE; + } + else + { + retval = PDFOperator.getOperator( next ); + } + break; + } + case 'R': + { + String line = readString(); + if( line.equals( "R" ) ) + { + retval = new COSObject( null ); + } + else + { + retval = PDFOperator.getOperator( line ); + } + break; + } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + case '+': + case '.': + { + if( Character.isDigit(c) || c == '-' || c == '+' || c == '.') + { + StringBuffer buf = new StringBuffer(); + while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' ) + { + buf.append( c ); + pdfSource.read(); + } + retval = COSNumber.get( buf.toString() ); + } + else + { + throw new IOException( "Unknown dir object c='" + c + + "' peek='" + (char)pdfSource.peek() + "' " + pdfSource ); + } + break; + } + case 'B': + { + String next = readString(); + retval = PDFOperator.getOperator( next ); + + if( next.equals( "BI" ) ) + { + lastBIToken = (PDFOperator)retval; + COSDictionary imageParams = new COSDictionary(); + lastBIToken.setImageParameters( new ImageParameters( imageParams ) ); + Object nextToken = null; + while( (nextToken = parseNextToken()) instanceof COSName ) + { + Object value = parseNextToken(); + imageParams.setItem( (COSName)nextToken, (COSBase)value ); + } + //final token will be the image data, maybe?? + PDFOperator imageData = (PDFOperator)nextToken; + lastBIToken.setImageData( imageData.getImageData() ); + } + break; + } + case 'I': + { + ImageParameters imageParams = lastBIToken.getImageParameters(); + int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() * + (imageParams.getBitsPerComponent()/8) ); + //Special case for ID operator + String id = "" + (char)pdfSource.read() + (char)pdfSource.read(); + if( !id.equals( "ID" ) ) + { + throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" ); + } + ByteArrayOutputStream imageData = new ByteArrayOutputStream(); + boolean foundEnd = false; + if( this.isWhitespace() ) + { + //pull off the whitespace character + pdfSource.read(); + } + int twoBytesAgo = 0; + int lastByte = pdfSource.read(); + int currentByte = pdfSource.read(); + int count = 0; + //PDF spec is kinda unclear about this. Should a whitespace + //always appear before EI? Not sure, I found a PDF + //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part + //of the image data and will stop parsing prematurely if there is + //not a check for EI. + while( !(isWhitespace( twoBytesAgo ) && + lastByte == 'E' && + currentByte == 'I' && + isWhitespace() //&& + //amyuni2_05d__pdf1_3_acro4x.pdf has image data that + //is compressed, so expectedBytes is useless here. + //count >= expectedBytes + ) && + !pdfSource.isEOF() ) + { + imageData.write( lastByte ); + twoBytesAgo = lastByte; + lastByte = currentByte; + currentByte = pdfSource.read(); + count++; + } + pdfSource.unread( 'I' ); //unread the EI operator + pdfSource.unread( 'E' ); + retval = PDFOperator.getOperator( "ID" ); + ((PDFOperator)retval).setImageData( imageData.toByteArray() ); + break; + } + case ']': + { + // some ']' around without its previous '[' + // this means a PDF is somewhat corrupt but we will continue to parse. + pdfSource.read(); + retval = COSNull.NULL; // must be a better solution than null... + break; + } + default: + { + //we must be an operator + String operator = readOperator(); + if( operator.trim().length() == 0 ) + { + //we have a corrupt stream, stop reading here + retval = null; + } + else + { + retval = PDFOperator.getOperator( operator ); + } + } + + } + if( log.isDebugEnabled() ) + { + log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" ); + } + + return retval; + } + + /** + * This will read an operator from the stream. + * + * @return The operator that was read from the stream. + * + * @throws IOException If there is an error reading from the stream. + */ + protected String readOperator() throws IOException + { + skipSpaces(); + + //average string size is around 2 and the normal string buffer size is + //about 16 so lets save some space. + StringBuffer buffer = new StringBuffer(4); + while( + !isWhitespace() && + !isClosing() && + !pdfSource.isEOF() && + pdfSource.peek() != (int)'[' && + pdfSource.peek() != (int)'<' && + pdfSource.peek() != (int)'(' && + pdfSource.peek() != (int)'/' && + (pdfSource.peek() < (int)'0' || + pdfSource.peek() > (int)'9' ) ) + { + buffer.append( (char)pdfSource.read() ); + } + return buffer.toString(); + } +} \ No newline at end of file -- cgit v1.2.3