aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java')
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java403
1 files changed, 0 insertions, 403 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
deleted file mode 100644
index d59c5a4..0000000
--- a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
+++ /dev/null
@@ -1,403 +0,0 @@
-/**
- * Copyright (c) 2003-2004, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-import java.io.ByteArrayOutputStream;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.pdfbox.cos.COSBase;
-import org.pdfbox.cos.COSBoolean;
-import org.pdfbox.cos.COSDictionary;
-import org.pdfbox.cos.COSName;
-import org.pdfbox.cos.COSNull;
-import org.pdfbox.cos.COSNumber;
-import org.pdfbox.cos.COSObject;
-import org.pdfbox.cos.COSStream;
-
-import org.pdfbox.util.PDFOperator;
-import org.pdfbox.util.ImageParameters;
-
-import org.apache.log4j.Logger;
-
-/**
- * This will parse a PDF byte stream and extract operands and such.
- *
- * @author Ben Litchfield (ben@csh.rit.edu)
- * @version $Revision: 1.29 $
- */
-public class PDFStreamParser extends BaseParser
-{
- private static Logger log = Logger.getLogger(PDFStreamParser.class);
- private List streamObjects = new ArrayList( 100 );
- private RandomAccessFile file;
- private PDFOperator lastBIToken = null;
-
- /**
- * Constructor that takes a stream to parse.
- *
- * @param stream The stream to read data from.
- * @param raf The random access file.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
- {
- super( stream );
- file = raf;
- }
-
- /**
- * Constructor.
- *
- * @param stream The stream to parse.
- *
- * @throws IOException If there is an error initializing the stream.
- */
- public PDFStreamParser( COSStream stream ) throws IOException
- {
- this( stream.getUnfilteredStream(), stream.getScratchFile() );
- }
-
- /**
- * This will parse the tokens in the stream. This will close the
- * stream when it is finished parsing.
- *
- * @throws IOException If there is an error while parsing the stream.
- */
- public void parse() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug( "parse() start" );
- }
-
- try
- {
- Object token = null;
- while( (token = parseNextToken()) != null )
- {
- streamObjects.add( token );
- }
- }
- finally
- {
- pdfSource.close();
- }
- if( log.isDebugEnabled() )
- {
- log.debug( "parse() end" );
- }
- }
-
- /**
- * This will get the tokens that were parsed from the stream.
- *
- * @return All of the tokens in the stream.
- */
- public List getTokens()
- {
- return streamObjects;
- }
-
- /**
- * This will parse the next token in the stream.
- *
- * @return The next token in the stream or null if there are no more tokens in the stream.
- *
- * @throws IOException If an io error occurs while parsing the stream.
- */
- private Object parseNextToken() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug( "parseNextToken() start" );
- }
- Object retval = null;
-
- skipSpaces();
- int nextByte = pdfSource.peek();
- if( ((byte)nextByte) == -1 )
- {
- return null;
- }
- char c = (char)nextByte;
- switch(c)
- {
- case '<':
- {
- int leftBracket = pdfSource.read();//pull off first left bracket
- c = (char)pdfSource.peek(); //check for second left bracket
- pdfSource.unread( leftBracket ); //put back first bracket
- if(c == '<')
- {
-
- COSDictionary pod = parseCOSDictionary();
- skipSpaces();
- if((char)pdfSource.peek() == 's')
- {
- retval = parseCOSStream( pod, file );
- }
- else
- {
- retval = pod;
- }
- }
- else
- {
- retval = parseCOSString();
- }
- break;
- }
- case '[': // array
- {
- retval = parseCOSArray();
- break;
- }
- case '(': // string
- retval = parseCOSString();
- break;
- case '/': // name
- retval = parseCOSName();
- break;
- case 'n': // null
- {
- String nullString = readString();
- if( nullString.equals( "null") )
- {
- retval = COSNull.NULL;
- }
- else
- {
- retval = PDFOperator.getOperator( nullString );
- }
- break;
- }
- case 't':
- case 'f':
- {
- String next = readString();
- if( next.equals( "true" ) )
- {
- retval = COSBoolean.TRUE;
- break;
- }
- else if( next.equals( "false" ) )
- {
- retval = COSBoolean.FALSE;
- }
- else
- {
- retval = PDFOperator.getOperator( next );
- }
- break;
- }
- case 'R':
- {
- String line = readString();
- if( line.equals( "R" ) )
- {
- retval = new COSObject( null );
- }
- else
- {
- retval = PDFOperator.getOperator( line );
- }
- break;
- }
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- case '-':
- case '+':
- case '.':
- {
- if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
- {
- StringBuffer buf = new StringBuffer();
- while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
- {
- buf.append( c );
- pdfSource.read();
- }
- retval = COSNumber.get( buf.toString() );
- }
- else
- {
- throw new IOException( "Unknown dir object c='" + c +
- "' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
- }
- break;
- }
- case 'B':
- {
- String next = readString();
- retval = PDFOperator.getOperator( next );
-
- if( next.equals( "BI" ) )
- {
- lastBIToken = (PDFOperator)retval;
- COSDictionary imageParams = new COSDictionary();
- lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
- Object nextToken = null;
- while( (nextToken = parseNextToken()) instanceof COSName )
- {
- Object value = parseNextToken();
- imageParams.setItem( (COSName)nextToken, (COSBase)value );
- }
- //final token will be the image data, maybe??
- PDFOperator imageData = (PDFOperator)nextToken;
- lastBIToken.setImageData( imageData.getImageData() );
- }
- break;
- }
- case 'I':
- {
- ImageParameters imageParams = lastBIToken.getImageParameters();
- int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() *
- (imageParams.getBitsPerComponent()/8) );
- //Special case for ID operator
- String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
- if( !id.equals( "ID" ) )
- {
- throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
- }
- ByteArrayOutputStream imageData = new ByteArrayOutputStream();
- boolean foundEnd = false;
- if( this.isWhitespace() )
- {
- //pull off the whitespace character
- pdfSource.read();
- }
- int twoBytesAgo = 0;
- int lastByte = pdfSource.read();
- int currentByte = pdfSource.read();
- int count = 0;
- //PDF spec is kinda unclear about this. Should a whitespace
- //always appear before EI? Not sure, I found a PDF
- //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
- //of the image data and will stop parsing prematurely if there is
- //not a check for <whitespace>EI<whitespace>.
- while( !(isWhitespace( twoBytesAgo ) &&
- lastByte == 'E' &&
- currentByte == 'I' &&
- isWhitespace() //&&
- //amyuni2_05d__pdf1_3_acro4x.pdf has image data that
- //is compressed, so expectedBytes is useless here.
- //count >= expectedBytes
- ) &&
- !pdfSource.isEOF() )
- {
- imageData.write( lastByte );
- twoBytesAgo = lastByte;
- lastByte = currentByte;
- currentByte = pdfSource.read();
- count++;
- }
- pdfSource.unread( 'I' ); //unread the EI operator
- pdfSource.unread( 'E' );
- retval = PDFOperator.getOperator( "ID" );
- ((PDFOperator)retval).setImageData( imageData.toByteArray() );
- break;
- }
- case ']':
- {
- // some ']' around without its previous '['
- // this means a PDF is somewhat corrupt but we will continue to parse.
- pdfSource.read();
- retval = COSNull.NULL; // must be a better solution than null...
- break;
- }
- default:
- {
- //we must be an operator
- String operator = readOperator();
- if( operator.trim().length() == 0 )
- {
- //we have a corrupt stream, stop reading here
- retval = null;
- }
- else
- {
- retval = PDFOperator.getOperator( operator );
- }
- }
-
- }
- if( log.isDebugEnabled() )
- {
- log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
- }
-
- return retval;
- }
-
- /**
- * This will read an operator from the stream.
- *
- * @return The operator that was read from the stream.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readOperator() throws IOException
- {
- skipSpaces();
-
- //average string size is around 2 and the normal string buffer size is
- //about 16 so lets save some space.
- StringBuffer buffer = new StringBuffer(4);
- while(
- !isWhitespace() &&
- !isClosing() &&
- !pdfSource.isEOF() &&
- pdfSource.peek() != (int)'[' &&
- pdfSource.peek() != (int)'<' &&
- pdfSource.peek() != (int)'(' &&
- pdfSource.peek() != (int)'/' &&
- (pdfSource.peek() < (int)'0' ||
- pdfSource.peek() > (int)'9' ) )
- {
- buffer.append( (char)pdfSource.read() );
- }
- return buffer.toString();
- }
-} \ No newline at end of file