aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/pdfparser
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/pdfparser')
-rw-r--r--src/main/java/org/pdfbox/pdfparser/BaseParser.java1369
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java137
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFParser.java557
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java403
-rw-r--r--src/main/java/org/pdfbox/pdfparser/PDFXref.java96
-rw-r--r--src/main/java/org/pdfbox/pdfparser/package.html9
6 files changed, 0 insertions, 2571 deletions
diff --git a/src/main/java/org/pdfbox/pdfparser/BaseParser.java b/src/main/java/org/pdfbox/pdfparser/BaseParser.java
deleted file mode 100644
index 3937025..0000000
--- a/src/main/java/org/pdfbox/pdfparser/BaseParser.java
+++ /dev/null
@@ -1,1369 +0,0 @@
-/**
- * Copyright (c) 2003-2005, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.RandomAccessFile;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.pdfbox.io.ByteArrayPushBackInputStream;
-import org.pdfbox.io.PushBackInputStream;
-
-import org.pdfbox.cos.COSArray;
-import org.pdfbox.cos.COSBase;
-import org.pdfbox.cos.COSBoolean;
-import org.pdfbox.cos.COSDictionary;
-import org.pdfbox.cos.COSDocument;
-import org.pdfbox.cos.COSInteger;
-import org.pdfbox.cos.COSName;
-import org.pdfbox.cos.COSNull;
-import org.pdfbox.cos.COSNumber;
-import org.pdfbox.cos.COSObject;
-import org.pdfbox.cos.COSStream;
-import org.pdfbox.cos.COSString;
-
-import org.pdfbox.persistence.util.COSObjectKey;
-import org.apache.log4j.Logger;
-
-/**
- * This class is used to contain parsing logic that will be used by both the
- * PDFParser and the COSStreamParser.
- *
- * @author Ben Litchfield (ben@benlitchfield.com)
- * @version $Revision: 1.57 $
- */
-public abstract class BaseParser
-{
- private static Logger log = Logger.getLogger(BaseParser.class);
-
- /**
- * This is a byte array that will be used for comparisons.
- */
- public static final byte[] ENDSTREAM =
- new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" );
-
- /**
- * This is a byte array that will be used for comparisons.
- */
- public static final String DEF = "def";
-
- /**
- * This is the stream that will be read from.
- */
- //protected PushBackByteArrayStream pdfSource;
- protected PushBackInputStream pdfSource;
-
- /**
- * moved xref here, is a persistence construct
- * maybe not needed anyway when not read from behind with delayed
- * access to objects.
- */
- private List xrefs = new ArrayList();
-
- private COSDocument document;
-
- /**
- * Constructor.
- *
- * @param input The input stream to read the data from.
- *
- * @throws IOException If there is an error reading the input stream.
- */
- public BaseParser( InputStream input) throws IOException
- {
- //pdfSource = new PushBackByteArrayStream( input );
- pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 );
- }
-
- /**
- * Constructor.
- *
- * @param input The array to read the data from.
- *
- * @throws IOException If there is an error reading the byte data.
- */
- protected BaseParser(byte[] input) throws IOException
- {
- pdfSource = new ByteArrayPushBackInputStream(input);
- }
-
- /**
- * Set the document for this stream.
- *
- * @param doc The current document.
- */
- public void setDocument( COSDocument doc )
- {
- document = doc;
- }
-
- private static boolean isHexDigit(char ch)
- {
- return (ch >= '0' && ch <= '9') ||
- (ch >= 'a' && ch <= 'f') ||
- (ch >= 'A' && ch <= 'F');
- // the line below can lead to problems with certain versions of the IBM JIT compiler
- // (and is slower anyway)
- //return (HEXDIGITS.indexOf(ch) != -1);
- }
-
- /**
- * This will parse a PDF dictionary value.
- *
- * @return The parsed Dictionary object.
- *
- * @throws IOException If there is an error parsing the dictionary object.
- */
- private COSBase parseCOSDictionaryValue() throws IOException
- {
-
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSDictionaryValue() " + pdfSource );
- }
- COSBase retval = null;
- COSBase number = parseDirObject();
- skipSpaces();
- char next = (char)pdfSource.peek();
- if( next >= '0' && next <= '9' )
- {
- COSBase generationNumber = parseDirObject();
- skipSpaces();
- char r = (char)pdfSource.read();
- if( r != 'R' )
- {
- throw new IOException( "expected='R' actual='" + r + "' " + pdfSource );
- }
- COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
- ((COSInteger) generationNumber).intValue());
- retval = document.getObjectFromPool(key);
- }
- else
- {
- retval = number;
- }
- return retval;
- }
-
- /**
- * This will parse a PDF dictionary.
- *
- * @return The parsed dictionary.
- *
- * @throws IOException IF there is an error reading the stream.
- */
- protected COSDictionary parseCOSDictionary() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSDictionary() " + pdfSource );
- }
- char c = (char)pdfSource.read();
- if( c != '<')
- {
- throw new IOException( "expected='<' actual='" + c + "'" );
- }
- c = (char)pdfSource.read();
- if( c != '<')
- {
- throw new IOException( "expected='<' actual='" + c + "' " + pdfSource );
- }
- skipSpaces();
- COSDictionary obj = new COSDictionary();
- boolean done = false;
- while( !done )
- {
- skipSpaces();
- c = (char)pdfSource.peek();
- if( c == '>')
- {
- done = true;
- }
- else
- {
- COSName key = parseCOSName();
- COSBase value = parseCOSDictionaryValue();
- skipSpaces();
- if( ((char)pdfSource.peek()) == 'd' )
- {
- //if the next string is 'def' then we are parsing a cmap stream
- //and want to ignore it, otherwise throw an exception.
- String potentialDEF = readString();
- if( !potentialDEF.equals( DEF ) )
- {
- pdfSource.unread( potentialDEF.getBytes() );
- }
- else
- {
- skipSpaces();
- }
- }
-
- if( value == null )
- {
- throw new IOException("Bad Dictionary Declaration " + pdfSource );
- }
- obj.setItem( key, value );
- }
- }
- char ch = (char)pdfSource.read();
- if( ch != '>' )
- {
- throw new IOException( "expected='>' actual='" + ch + "'" );
- }
- ch = (char)pdfSource.read();
- if( ch != '>' )
- {
- throw new IOException( "expected='>' actual='" + ch + "'" );
- }
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSDictionary() done peek='" + pdfSource.peek() + "'" );
- }
- return obj;
- }
-
- /**
- * This will read a COSStream from the input stream.
- *
- * @param file The file to write the stream to when reading.
- * @param dic The dictionary that goes with this stream.
- *
- * @return The parsed pdf stream.
- *
- * @throws IOException If there is an error reading the stream.
- */
- protected COSStream parseCOSStream( COSDictionary dic, RandomAccessFile file ) throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSStream() " + pdfSource );
- }
- COSStream stream = new COSStream( dic, file );
- OutputStream out = null;
- try
- {
- String streamString = readString();
- //long streamLength;
-
- if (!streamString.equals("stream"))
- {
- throw new IOException("expected='stream' actual='" + streamString + "'");
- }
-
- //PDF Ref 3.2.7 A stream must be followed by either
- //a CRLF or LF but nothing else.
-
- int whitespace = pdfSource.read();
-
- //see brother_scan_cover.pdf, it adds whitespaces
- //after the stream but before the start of the
- //data, so just read those first
- while (whitespace == 0x20)
- {
- whitespace = pdfSource.read();
- }
-
- if( whitespace == 0x0D )
- {
- whitespace = pdfSource.read();
- if( whitespace != 0x0A )
- {
- pdfSource.unread( whitespace );
- //The spec says this is invalid but it happens in the real
- //world so we must support it.
- //throw new IOException("expected='0x0A' actual='0x" +
- // Integer.toHexString(whitespace) + "' " + pdfSource);
- }
- }
- else if (whitespace == 0x0A)
- {
- //that is fine
- }
- else
- {
- //we are in an error.
- //but again we will do a lenient parsing and just assume that everything
- //is fine
- pdfSource.unread( whitespace );
- //throw new IOException("expected='0x0D or 0x0A' actual='0x" +
- //Integer.toHexString(whitespace) + "' " + pdfSource);
-
- }
-
-
- COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH);
- long length = -1;
- if( streamLength instanceof COSNumber )
- {
- length = ((COSNumber)streamLength).intValue();
- }
- else if( streamLength instanceof COSObject &&
- ((COSObject)streamLength).getObject() instanceof COSNumber )
- {
- length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
- }
-
- //length = -1;
- //streamLength = null;
-
- //Need to keep track of the
- out = stream.createFilteredStream( streamLength );
- String endStream = null;
- //the length is wrong in some pdf documents which means
- //that PDFBox must basically ignore it in order to be able to read
- //the most number of PDF documents. This of course is a penalty hit,
- //maybe I could implement a faster parser.
- /**if( length != -1 )
- {
- byte[] buffer = new byte[1024];
- int amountRead = 0;
- int totalAmountRead = 0;
- while( amountRead != -1 && totalAmountRead < length )
- {
- int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
- amountRead = pdfSource.read(buffer,0,maxAmountToRead);
- totalAmountRead += amountRead;
- if( amountRead != -1 )
- {
- out.write( buffer, 0, amountRead );
- }
- }
- }
- else
- {**/
- readUntilEndStream( out );
- /**}*/
- skipSpaces();
- endStream = readString();
-
- if (!endStream.equals("endstream"))
- {
- readUntilEndStream( out );
- endStream = readString();
- if( !endStream.equals( "endstream" ) )
- {
- throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
- }
- }
- }
- finally
- {
- if( out != null )
- {
- out.close();
- }
- }
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSStream() done" );
- }
- return stream;
- }
-
- private void readUntilEndStream( OutputStream out ) throws IOException
- {
- int currentIndex = 0;
- int byteRead = 0;
- //this is the additional bytes buffered but not written
- int additionalBytes=0;
- byte[] buffer = new byte[ENDSTREAM.length+additionalBytes];
- int writeIndex = 0;
- while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 )
- {
- writeIndex = currentIndex - buffer.length;
- if( writeIndex >= 0 )
- {
- out.write( buffer[writeIndex%buffer.length] );
- }
- byteRead = pdfSource.read();
- buffer[currentIndex%buffer.length] = (byte)byteRead;
- currentIndex++;
- }
-
- //we want to ignore the end of the line data when reading a stream
- //so will make an attempt to ignore it.
- /*writeIndex = currentIndex - buffer.length;
- if( buffer[writeIndex%buffer.length] == 13 &&
- buffer[(writeIndex+1)%buffer.length] == 10 )
- {
- //then ignore the newline before the endstream
- }
- else if( buffer[(writeIndex+1)%buffer.length] == 10 )
- {
- //Then first byte is data, second byte is newline
- out.write( buffer[writeIndex%buffer.length] );
- }
- else
- {
- out.write( buffer[writeIndex%buffer.length] );
- out.write( buffer[(writeIndex+1)%buffer.length] );
- }*/
-
- /**
- * Old way of handling newlines before endstream
- for( int i=0; i<additionalBytes; i++ )
- {
- writeIndex = currentIndex - buffer.length;
- if( writeIndex >=0 &&
- //buffer[writeIndex%buffer.length] != 10 &&
- buffer[writeIndex%buffer.length] != 13 )
- {
- out.write( buffer[writeIndex%buffer.length] );
- }
- currentIndex++;
- }
- */
- pdfSource.unread( ENDSTREAM );
-
- }
-
- /**
- * This basically checks to see if the next compareTo.length bytes of the
- * buffer match the compareTo byte array.
- */
- private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo )
- {
- int cmpLen = compareTo.length;
- int buflen = buffer.length;
- boolean match = true;
- int off = currentIndex-cmpLen;
- if( off < 0 )
- {
- match = false;
- }
- for( int i=0; match && i<cmpLen; ++i )
- {
- match = buffer[(off+i)%buflen] == compareTo[i];
- }
- return match;
- }
-
- /**
- * This will parse a PDF string.
- *
- * @return The parsed PDF string.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected COSString parseCOSString() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSString() " + pdfSource );
- }
- char nextChar = (char)pdfSource.read();
- COSString retval = new COSString();
- char openBrace;
- char closeBrace;
- if( nextChar == '(' )
- {
- openBrace = '(';
- closeBrace = ')';
- }
- else if( nextChar == '<' )
- {
- openBrace = '<';
- closeBrace = '>';
- }
- else
- {
- throw new IOException( "parseCOSString string should start with '(' or '<' and not '" +
- nextChar + "' " + pdfSource );
- }
-
- //This is the number of braces read
- //
- int braces = 1;
- int c = pdfSource.read();
- while( braces > 0 && c != -1)
- {
- char ch = (char)c;
- int nextc = -2; // not yet read
- //if( log.isDebugEnabled() )
- //{
- // log.debug( "Parsing COSString character '" + c + "' code=" + (int)c );
- //}
-
- if(ch == closeBrace)
- {
- braces--;
- byte[] nextThreeBytes = new byte[3];
- int amountRead = pdfSource.read(nextThreeBytes);
-
- //lets handle the special case seen in Bull River Rules and Regulations.pdf
- //The dictionary looks like this
- // 2 0 obj
- // <<
- // /Type /Info
- // /Creator (PaperPort http://www.scansoft.com)
- // /Producer (sspdflib 1.0 http://www.scansoft.com)
- // /Title ( (5)
- // /Author ()
- // /Subject ()
- //
- // Notice the /Title, the braces are not even but they should
- // be. So lets assume that if we encounter an this scenario
- // <end_brace><new_line><opening_slash> then that
- // means that there is an error in the pdf and assume that
- // was the end of the document.
- if( amountRead == 3 )
- {
- if( nextThreeBytes[0] == 0x0d &&
- nextThreeBytes[1] == 0x0a &&
- nextThreeBytes[2] == 0x2f )
- {
- braces = 0;
- }
- }
- pdfSource.unread( nextThreeBytes, 0, amountRead );
- if( braces != 0 )
- {
- retval.append( ch );
- }
- }
- else if( ch == openBrace )
- {
- braces++;
- retval.append( ch );
- }
- else if( ch == '\\' )
- {
- //patched by ram
- char next = (char)pdfSource.read();
- switch(next)
- {
- case 'n':
- retval.append( '\n' );
- break;
- case 'r':
- retval.append( '\r' );
- break;
- case 't':
- retval.append( '\t' );
- break;
- case 'b':
- retval.append( '\b' );
- break;
- case 'f':
- retval.append( '\f' );
- break;
- case '(':
- case ')':
- case '\\':
- retval.append( next );
- break;
- case 10:
- case 13:
- //this is a break in the line so ignore it and the newline and continue
- c = pdfSource.read();
- while( isEOL(c) && c != -1)
- {
- c = pdfSource.read();
- }
- nextc = c;
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- {
- StringBuffer octal = new StringBuffer();
- octal.append( next );
- c = pdfSource.read();
- char digit = (char)c;
- if( digit >= '0' && digit <= '7' )
- {
- octal.append( digit );
- c = pdfSource.read();
- digit = (char)c;
- if( digit >= '0' && digit <= '7' )
- {
- octal.append( digit );
- }
- else
- {
- nextc = c;
- }
- }
- else
- {
- nextc = c;
- }
-
- int character = 0;
- try
- {
- character = Integer.parseInt( octal.toString(), 8 );
- }
- catch( NumberFormatException e )
- {
- throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
- }
- retval.append( character );
- break;
- }
- default:
- {
- retval.append( '\\' );
- retval.append( next );
- //another ficken problem with PDF's, sometimes the \ doesn't really
- //mean escape like the PDF spec says it does, sometimes is should be literal
- //which is what we will assume here.
- //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
- }
- }
- }
- else
- {
- if( openBrace == '<' )
- {
- if( isHexDigit(ch) )
- {
- retval.append( ch );
- }
- }
- else
- {
- retval.append( ch );
- }
- }
- if (nextc != -2)
- {
- c = nextc;
- }
- else
- {
- c = pdfSource.read();
- }
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- if( openBrace == '<' )
- {
- retval = COSString.createFromHexString( retval.getString() );
- }
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSString() done parsed=" + retval );
- }
- return retval;
- }
-
- /**
- * This will parse a PDF array object.
- *
- * @return The parsed PDF array.
- *
- * @throws IOException If there is an error parsing the stream.
- */
- protected COSArray parseCOSArray() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSArray() " + pdfSource );
- }
- char ch = (char)pdfSource.read();
- if( ch != '[')
- {
- throw new IOException( "expected='[' actual='" + ch + "'" );
- }
- COSArray po = new COSArray();
- COSBase pbo = null;
- skipSpaces();
- int i = 0;
- while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') )
- {
- pbo = parseDirObject();
- if( pbo instanceof COSObject )
- {
- COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
- COSInteger number = (COSInteger)po.remove( po.size() -1 );
- COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue());
- pbo = document.getObjectFromPool(key);
- }
- if( pbo != null )
- {
- po.add( pbo );
- }
- else
- {
- //it could be a bad object in the array which is just skipped
- }
- skipSpaces();
- }
- pdfSource.read(); //read ']'
- skipSpaces();
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSArray() done peek='" + (char)pdfSource.peek() + "'" );
- }
- return po;
- }
-
- /**
- * Determine if a character terminates a PDF name.
- *
- * @param ch The character
- * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
- */
- protected boolean isEndOfName(char ch)
- {
- return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<'
- || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' ||
- ch == -1 //EOF
- );
- }
-
- /**
- * This will parse a PDF name from the stream.
- *
- * @return The parsed PDF name.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected COSName parseCOSName() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseCOSName() " + pdfSource );
- }
- COSName retval = null;
- int c = pdfSource.read();
- if( (char)c != '/')
- {
- throw new IOException("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource );
- }
- // costruisce il nome
- StringBuffer buffer = new StringBuffer();
- c = pdfSource.read();
- while( c != -1 )
- {
- char ch = (char)c;
- if(ch == '#')
- {
- char ch1 = (char)pdfSource.read();
- char ch2 = (char)pdfSource.read();
-
- // Prior to PDF v1.2, the # was not a special character. Also,
- // it has been observed that various PDF tools do not follow the
- // spec with respect to the # escape, even though they report
- // PDF versions of 1.2 or later. The solution here is that we
- // interpret the # as an escape only when it is followed by two
- // valid hex digits.
- //
- if (isHexDigit(ch1) && isHexDigit(ch2))
- {
- String hex = "" + ch1 + ch2;
- try
- {
- buffer.append( (char) Integer.parseInt(hex, 16));
- }
- catch (NumberFormatException e)
- {
- if( log.isDebugEnabled() )
- {
- log.debug("isHexDigit(ch1)=" + isHexDigit(ch1) + ", isHexDigit(ch2)=" + isHexDigit(ch2));
- }
- throw new IOException("Error: expected hex number, actual='" + hex + "'");
- }
- c = pdfSource.read();
- }
- else
- {
- pdfSource.unread(ch2);
- c = ch1;
- buffer.append( ch );
- }
- }
- else if (isEndOfName(ch))
- {
- break;
- }
- else
- {
- buffer.append( ch );
- c = pdfSource.read();
- }
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- retval = COSName.getPDFName( buffer.toString() );
- return retval;
- }
-
- /**
- * This will parse a boolean object from the stream.
- *
- * @return The parsed boolean object.
- *
- * @throws IOException If an IO error occurs during parsing.
- */
- protected COSBoolean parseBoolean() throws IOException
- {
- COSBoolean retval = null;
- char c = (char)pdfSource.peek();
- if( c == 't' )
- {
- byte[] trueArray = new byte[ 4 ];
- int amountRead = pdfSource.read( trueArray, 0, 4 );
- String trueString = new String( trueArray, 0, amountRead );
- if( !trueString.equals( "true" ) )
- {
- throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
- }
- else
- {
- retval = COSBoolean.TRUE;
- }
- }
- else if( c == 'f' )
- {
- byte[] falseArray = new byte[ 5 ];
- int amountRead = pdfSource.read( falseArray, 0, 5 );
- String falseString = new String( falseArray, 0, amountRead );
- if( !falseString.equals( "false" ) )
- {
- throw new IOException( "Error parsing boolean: expected='true' actual='" + falseString + "'" );
- }
- else
- {
- retval = COSBoolean.FALSE;
- }
- }
- else
- {
- throw new IOException( "Error parsing boolean expected='t or f' actual='" + c + "'" );
- }
- return retval;
- }
-
- /**
- * This will parse a directory object from the stream.
- *
- * @return The parsed object.
- *
- * @throws IOException If there is an error during parsing.
- */
- protected COSBase parseDirObject() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug("parseDirObject() " + pdfSource );
- }
- COSBase retval = null;
-
- skipSpaces();
- int nextByte = pdfSource.peek();
- char c = (char)nextByte;
- switch(c)
- {
- case '<':
- {
- int leftBracket = pdfSource.read();//pull off first left bracket
- c = (char)pdfSource.peek(); //check for second left bracket
- pdfSource.unread( leftBracket );
- if(c == '<')
- {
-
- retval = parseCOSDictionary();
- skipSpaces();
- }
- else
- {
- retval = parseCOSString();
- }
- break;
- }
- case '[': // array
- {
- retval = parseCOSArray();
- break;
- }
- case '(':
- retval = parseCOSString();
- break;
- case '/': // name
- retval = parseCOSName();
- break;
- case 'n': // null
- {
- String nullString = readString();
- if( !nullString.equals( "null") )
- {
- throw new IOException("Expected='null' actual='" + nullString + "'");
- }
- retval = COSNull.NULL;
- break;
- }
- case 't':
- {
- byte[] trueBytes = new byte[4];
- int amountRead = pdfSource.read( trueBytes, 0, 4 );
- String trueString = new String( trueBytes, 0, amountRead );
- if( trueString.equals( "true" ) )
- {
- retval = COSBoolean.TRUE;
- }
- else
- {
- throw new IOException( "expected true actual='" + trueString + "' " + pdfSource );
- }
- break;
- }
- case 'f':
- {
- byte[] falseBytes = new byte[5];
- int amountRead = pdfSource.read( falseBytes, 0, 5 );
- String falseString = new String( falseBytes, 0, amountRead );
- if( falseString.equals( "false" ) )
- {
- retval = COSBoolean.FALSE;
- }
- else
- {
- throw new IOException( "expected false actual='" + falseString + "' " + pdfSource );
- }
- break;
- }
- case 'R':
- pdfSource.read();
- retval = new COSObject(null);
- break;
- case (char)-1:
- return null;
- default:
- {
- if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
- {
- StringBuffer buf = new StringBuffer();
- int ic = pdfSource.read();
- c = (char)ic;
- while( Character.isDigit( c )||
- c == '-' ||
- c == '+' ||
- c == '.' ||
- c == 'E' ||
- c == 'e' )
- {
- buf.append( c );
- ic = pdfSource.read();
- c = (char)ic;
- }
- if( ic != -1 )
- {
- pdfSource.unread( ic );
- }
- retval = COSNumber.get( buf.toString() );
- }
- else
- {
- //This is not suppose to happen, but we will allow for it
- //so we are more compatible with POS writers that don't
- //follow the spec
- String badString = readString();
- //throw new IOException( "Unknown dir object c='" + c +
- //"' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
- if( log.isDebugEnabled() )
- {
- log.debug("parseDirObject() bad DIR object found. ignoring: '" + badString + "'");
- }
- if( badString == null || badString.length() == 0 )
- {
- int peek = pdfSource.peek();
- // we can end up in an infinite loop otherwise
- throw new IOException( "Unknown dir object c='" + c +
- "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource );
- }
-
- }
- }
- }
- if( log.isDebugEnabled() )
- {
- log.debug("parseDirObject() done retval=" +retval );
- }
- return retval;
- }
-
- /**
- * This will read the next string from the stream.
- *
- * @return The string that was read from the stream.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readString() throws IOException
- {
- skipSpaces();
- StringBuffer buffer = new StringBuffer();
- int c = pdfSource.read();
- while( !isEndOfName((char)c) && !isClosing(c) && c != -1 )
- {
- buffer.append( (char)c );
- c = pdfSource.read();
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- return buffer.toString();
- }
-
- /**
- * This will read bytes until the end of line marker occurs.
- *
- * @param theString The next expected string in the stream.
- *
- * @return The characters between the current position and the end of the line.
- *
- * @throws IOException If there is an error reading from the stream or theString does not match what was read.
- */
- protected String readExpectedString( String theString ) throws IOException
- {
- int c = pdfSource.read();
- while( isWhitespace(c) && c != -1)
- {
- c = pdfSource.read();
- }
- StringBuffer buffer = new StringBuffer( theString.length() );
- int charsRead = 0;
- while( !isEOL(c) && c != -1 && charsRead < theString.length() )
- {
- char next = (char)c;
- buffer.append( next );
- if( theString.charAt( charsRead ) == next )
- {
- charsRead++;
- }
- else
- {
- throw new IOException( "Error: Expected to read '" + theString +
- "' instead started reading '" +buffer.toString() + "'" );
- }
- c = pdfSource.read();
- }
- while( isEOL(c) && c != -1 )
- {
- c = pdfSource.read();
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- return buffer.toString();
- }
-
- /**
- * This will read the next string from the stream up to a certain length.
- *
- * @param length The length to stop reading at.
- *
- * @return The string that was read from the stream of length 0 to length.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readString( int length ) throws IOException
- {
- skipSpaces();
-
- int c = pdfSource.read();
-
- //average string size is around 2 and the normal string buffer size is
- //about 16 so lets save some space.
- StringBuffer buffer = new StringBuffer(length);
- while( !isWhitespace(c) && !isClosing(c) && c != -1 && buffer.length() < length &&
- c != '[' &&
- c != '<' &&
- c != '(' &&
- c != '/' )
- {
- buffer.append( (char)c );
- c = pdfSource.read();
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- return buffer.toString();
- }
-
- /**
- * This will tell if the next character is a closing brace( close of PDF array ).
- *
- * @return true if the next byte is ']', false otherwise.
- *
- * @throws IOException If an IO error occurs.
- */
- protected boolean isClosing() throws IOException
- {
- return isClosing(pdfSource.peek());
- }
-
- /**
- * This will tell if the next character is a closing brace( close of PDF array ).
- *
- * @param c The character to check against end of line
- * @return true if the next byte is ']', false otherwise.
- */
- protected boolean isClosing(int c)
- {
- return c == ']';
- }
-
- /**
- * This will read bytes until the end of line marker occurs.
- *
- * @return The characters between the current position and the end of the line.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readLine() throws IOException
- {
- int c = pdfSource.read();
- while(isWhitespace(c) && c != -1)
- {
- c = pdfSource.read();
- }
- StringBuffer buffer = new StringBuffer( 11 );
-
- while( !isEOL(c) && c != -1 )
- {
- buffer.append( (char)c );
- c = pdfSource.read();
- }
- while( isEOL(c) && c != -1 )
- {
- c = pdfSource.read();
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- return buffer.toString();
- }
-
- /**
- * This will tell if the next byte to be read is an end of line byte.
- *
- * @return true if the next byte is 0x0A or 0x0D.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected boolean isEOL() throws IOException
- {
- return isEOL(pdfSource.peek());
- }
-
- /**
- * This will tell if the next byte to be read is an end of line byte.
- *
- * @param c The character to check against end of line
- * @return true if the next byte is 0x0A or 0x0D.
- */
- protected boolean isEOL(int c)
- {
- return c == 10 || c == 13;
- }
-
- /**
- * This will tell if the next byte is whitespace or not.
- *
- * @return true if the next byte in the stream is a whitespace character.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected boolean isWhitespace() throws IOException
- {
- return isWhitespace( pdfSource.peek() );
- }
-
- /**
- * This will tell if the next byte is whitespace or not.
- *
- * @param c The character to check against whitespace
- *
- * @return true if the next byte in the stream is a whitespace character.
- */
- protected boolean isWhitespace( int c )
- {
- return c == 0 || c == 9 || c == 12 || c == 10
- || c == 13 || c == 32;
- }
-
- /**
- * This will skip all spaces and comments that are present.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected void skipSpaces() throws IOException
- {
- //log( "skipSpaces() " + pdfSource );
- int c = pdfSource.read();
- // identical to, but faster as: isWhiteSpace(c) || c == 37
- while(c == 0 || c == 9 || c == 12 || c == 10
- || c == 13 || c == 32 || c == 37)//37 is the % character, a comment
- {
- if ( c == 37 )
- {
- // skip past the comment section
- c = pdfSource.read();
- while(!isEOL(c) && c != -1)
- {
- c = pdfSource.read();
- }
- }
- else
- {
- c = pdfSource.read();
- }
- }
- if (c != -1)
- {
- pdfSource.unread(c);
- }
- //log( "skipSpaces() done peek='" + (char)pdfSource.peek() + "'" );
- }
-
- /**
- * this will compare two byte arrays.
- *
- * @param first The first byte array to compare.
- * @param second The second byte array to compare.
- *
- * @return true if both arrays are the same AND forall i : first[i] = second[i]
- */
- private boolean cmpArray( byte[] first, byte[] second )
- {
- return cmpArray( first, 0, second );
- }
-
- /**
- * This will compare two arrays for equality.
- *
- * @param first The first array to compare.
- * @param firstOffset The first byte to start comparing.
- * @param second The second array to compare.
- */
- private boolean cmpArray( byte[] first, int firstOffset, byte[] second )
- {
- boolean retval = true;
- if( first.length-firstOffset >= second.length )
- {
- int arrayLength = second.length;
- for( int i =0; i<arrayLength && retval; i++ )
- {
- retval = retval && first[ firstOffset + i ] == second[ i ];
- }
- }
- else
- {
- retval = false;
- }
- return retval;
- }
-
- /**
- * This will read an integer from the stream.
- *
- * @return The integer that was read from the stream.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected int readInt() throws IOException
- {
- skipSpaces();
- int retval = 0;
-
- int lastByte = 0;
- StringBuffer intBuffer = new StringBuffer();
- while( (lastByte = pdfSource.read() ) != 32 &&
- lastByte != 10 &&
- lastByte != 13 &&
- lastByte != 0 && //See sourceforge bug 853328
- lastByte != -1 )
- {
- intBuffer.append( (char)lastByte );
- }
- try
- {
- retval = Integer.parseInt( intBuffer.toString() );
- }
- catch( NumberFormatException e )
- {
- throw new IOException( "Error: Expected an integer type, actual='" + intBuffer + "'" );
- }
- return retval;
- }
-
- /**
- * This will add an xref.
- *
- * @param xref The xref to add.
- */
- public void addXref( PDFXref xref )
- {
- xrefs.add(xref);
- }
-
- /**
- * This will get all of the xrefs.
- *
- * @return A list of all xrefs.
- */
- public List getXrefs()
- {
- return xrefs;
- }
-
- /**
- * This will set the xrefs for this parser.
- *
- * @param newXrefs The xrefs for this parser.
- */
- private void setXrefs( List newXrefs )
- {
- xrefs = newXrefs;
- }
-} \ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java
deleted file mode 100644
index 6fb7563..0000000
--- a/src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/**
- * Copyright (c) 2003-2004, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-import java.io.IOException;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.pdfbox.cos.COSBase;
-import org.pdfbox.cos.COSDocument;
-import org.pdfbox.cos.COSInteger;
-import org.pdfbox.cos.COSObject;
-import org.pdfbox.cos.COSStream;
-
-
-import org.apache.log4j.Logger;
-
-/**
- * This will parse a PDF 1.5 object stream and extract all of the objects from the stream.
- *
- * @author Ben Litchfield (ben@csh.rit.edu)
- * @version $Revision: 1.4 $
- */
-public class PDFObjectStreamParser extends BaseParser
-{
- private static Logger log = Logger.getLogger(PDFObjectStreamParser.class);
- private List streamObjects = null;
- private List objectNumbers = null;
- private COSStream stream;
-
- /**
- * Constructor.
- *
- * @param strm The stream to parse.
- * @param doc The document for the current parsing.
- *
- * @throws IOException If there is an error initializing the stream.
- */
- public PDFObjectStreamParser( COSStream strm, COSDocument doc ) throws IOException
- {
- super( strm.getUnfilteredStream() );
- setDocument( doc );
- stream = strm;
- }
-
- /**
- * This will parse the tokens in the stream. This will close the
- * stream when it is finished parsing.
- *
- * @throws IOException If there is an error while parsing the stream.
- */
- public void parse() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug( "parse() start" );
- }
-
- try
- {
- //need to first parse the header.
- int numberOfObjects = stream.getInt( "N" );
- objectNumbers = new ArrayList( numberOfObjects );
- streamObjects = new ArrayList( numberOfObjects );
- for( int i=0; i<numberOfObjects; i++ )
- {
- int objectNumber = readInt();
- int offset = readInt();
- if( log.isDebugEnabled() )
- {
- log.debug( "objNum:" + objectNumber + " offset:" + offset );
- }
- objectNumbers.add( new Integer( objectNumber ) );
- }
- COSObject object = null;
- COSBase cosObject = null;
- int objectCounter = 0;
- while( (cosObject = parseDirObject()) != null )
- {
- object = new COSObject(cosObject);
- object.setGenerationNumber( COSInteger.ZERO );
- COSInteger objNum =
- new COSInteger( ((Integer)objectNumbers.get( objectCounter)).intValue() );
- object.setObjectNumber( objNum );
- streamObjects.add( object );
- objectCounter++;
- }
- }
- finally
- {
- pdfSource.close();
- }
- if( log.isDebugEnabled() )
- {
- log.debug( "parse() end" );
- }
- }
-
- /**
- * This will get the objects that were parsed from the stream.
- *
- * @return All of the objects in the stream.
- */
- public List getObjects()
- {
- return streamObjects;
- }
-} \ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFParser.java b/src/main/java/org/pdfbox/pdfparser/PDFParser.java
deleted file mode 100644
index d655ef1..0000000
--- a/src/main/java/org/pdfbox/pdfparser/PDFParser.java
+++ /dev/null
@@ -1,557 +0,0 @@
-/**
- * Copyright (c) 2003-2005, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-import java.io.File;
-import java.io.RandomAccessFile;
-import java.io.InputStream;
-import java.io.IOException;
-
-import java.util.Iterator;
-
-import org.pdfbox.cos.COSBase;
-import org.pdfbox.cos.COSDictionary;
-import org.pdfbox.cos.COSDocument;
-import org.pdfbox.cos.COSObject;
-import org.pdfbox.cos.COSStream;
-import org.pdfbox.exceptions.WrappedIOException;
-
-import org.pdfbox.pdmodel.PDDocument;
-
-import org.pdfbox.pdmodel.fdf.FDFDocument;
-
-import org.pdfbox.persistence.util.COSObjectKey;
-
-import org.apache.log4j.Logger;
-
-/**
- * This class will handle the parsing of the PDF document.
- *
- * @author Ben Litchfield (ben@benlitchfield.com)
- * @version $Revision: 1.47 $
- */
-public class PDFParser extends BaseParser
-{
- private static Logger log = Logger.getLogger( PDFParser.class );
- private static final int SPACE_BYTE = 32;
-
- private static final String PDF_HEADER = "%PDF-";
- private COSDocument document;
-
- /**
- * Temp file directory.
- */
- private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) );
-
- private RandomAccessFile raf = null;
-
-    /**
-     * Constructor.  No RandomAccessFile is supplied, so parse() will create the
-     * document from the temp directory.
-     *
-     * @param input The input stream that contains the PDF document.
-     *
-     * @throws IOException If there is an error initializing the stream.
-     */
-    public PDFParser( InputStream input ) throws IOException
-    {
-        this( input, (RandomAccessFile)null );
-    }
-
-    /**
-     * Constructor to allow control over RandomAccessFile.
-     *
-     * @param input The input stream that contains the PDF document.
-     * @param rafi The RandomAccessFile handed to the COSDocument created by parse(),
-     *             may be null.
-     *
-     * @throws IOException If there is an error initializing the stream.
-     */
-    public PDFParser( InputStream input, RandomAccessFile rafi ) throws IOException
-    {
-        super( input );
-        raf = rafi;
-    }
-
-    /**
-     * This will set the directory where pdfbox creates a temporary file
-     * for storing the pdf document streams.  When this is never called the
-     * directory named by the system property java.io.tmpdir is used.
-     *
-     * @param tmpDir The directory to create scratch files needed to store
-     *        pdf document streams.
-     */
-    public void setTempDirectory( File tmpDir )
-    {
-        this.tempDirectory = tmpDir;
-    }
-
- /**
- * This will parse the stream and create the PDF document. This will close
- * the stream when it is done parsing.
- *
- * If anything is thrown while parsing, the document created so far is closed
- * and the error is rethrown, wrapped in a WrappedIOException when it is not
- * already an IOException.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- public void parse() throws IOException
- {
- try
- {
- //use the RandomAccessFile given to the constructor when there is one,
- //otherwise create the document from the temp directory.
- if ( raf == null )
- {
- document = new COSDocument( tempDirectory );
- }
- else
- {
- document = new COSDocument( raf );
- }
- setDocument( document );
- String header = readLine();
- if( log.isDebugEnabled() )
- {
- log.debug( "Header=" + header );
- }
- //the header is stored as it was read, before any leading garbage is trimmed below
- document.setHeaderString( header );
-
- if( header.length() < PDF_HEADER.length()+1 )
- {
- throw new IOException( "Error: Header is corrupt '" + header + "'" );
- }
-
- //sometimes there are some garbage bytes in the header before the header
- //actually starts, so lets try to find the header first.
- int headerStart = header.indexOf( PDF_HEADER );
-
- //greater than zero because if it is zero then
- //there is no point of trimming
- if( headerStart > 0 )
- {
- //trim off any leading characters
- header = header.substring( headerStart, header.length() );
- }
-
- try
- {
- //the version is parsed from at most three characters following the PDF_HEADER text
- float pdfVersion = Float.parseFloat(
- header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
- document.setVersion( pdfVersion );
- }
- catch( NumberFormatException e )
- {
- throw new IOException( "Error getting pdf version:" + e );
- }
-
- skipHeaderFillBytes();
-
-
- Object nextObject;
- //remembers if the most recently parsed object was an xref, see the catch block below
- boolean wasLastParsedObjectAnXref = false;
- try
- {
- //parseObject() returns null at the end of the file
- while( (nextObject = parseObject()) != null )
- {
- if( nextObject instanceof PDFXref )
- {
- PDFXref xref = (PDFXref)nextObject;
- addXref(xref);
- wasLastParsedObjectAnXref = true;
- }
- else
- {
- wasLastParsedObjectAnXref = false;
- }
- skipSpaces();
- }
- //no trailer was set while parsing, so build one from the entries
- //of every object of type XRef in the document.
- if( document.getTrailer() == null )
- {
- COSDictionary trailer = new COSDictionary();
- Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
- while( xrefIter.hasNext() )
- {
- COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
- trailer.addAll( next );
- }
- document.setTrailer( trailer );
- }
- if( !document.isEncrypted() )
- {
- document.dereferenceObjectStreams();
- }
- }
- catch( IOException e )
- {
- if( wasLastParsedObjectAnXref )
- {
- log.debug( "Skipping some garbage", e );
- //Then we assume that there is just random garbage after
- //the xref, not sure why the PDF spec allows this but it does.
- }
- else
- {
- //some other error so just pass it along
- throw e;
- }
- }
- }
- catch( Throwable t )
- {
- //so if the PDF is corrupt then close the document and clear
- //all resources to it
- if( document != null )
- {
- document.close();
- }
- if( t instanceof IOException )
- {
- throw (IOException)t;
- }
- else
- {
- throw new WrappedIOException( t );
- }
- }
- finally
- {
- //the source stream is closed on success and on failure
- pdfSource.close();
- }
- }
-
-    /**
-     * This will skip a header's binary fill bytes.  This is in accordance to
-     * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
-     *
-     * After skipping spaces the next character is peeked at; when it is not a
-     * digit the rest of that line is read and discarded.
-     *
-     * @throws IOException If there is an error reading from the stream.
-     */
-    protected void skipHeaderFillBytes() throws IOException
-    {
-        skipSpaces();
-        char next = (char)pdfSource.peek();
-        if( Character.isDigit( next ) )
-        {
-            //no fill bytes
-            return;
-        }
-        // Fill bytes conform with PDF reference (but without comment sign)
-        // => skip until EOL
-        readLine();
-    }
-
-    /**
-     * This will get the document that was parsed.  parse() must be called before this is called.
-     * When you are done with this document you must call close() on it to release
-     * resources.
-     *
-     * @return The document that was parsed.
-     *
-     * @throws IOException If no document has been created yet.
-     */
-    public COSDocument getDocument() throws IOException
-    {
-        if( document != null )
-        {
-            return document;
-        }
-        throw new IOException( "You must call parse() before calling getDocument()" );
-    }
-
-    /**
-     * This will wrap the parsed document in a new PDDocument.  When you are done with
-     * this document you must call close() on it to release resources.
-     *
-     * @return The document at the PD layer.
-     *
-     * @throws IOException If there is an error getting the document.
-     */
-    public PDDocument getPDDocument() throws IOException
-    {
-        COSDocument parsed = getDocument();
-        return new PDDocument( parsed );
-    }
-
-    /**
-     * This will wrap the parsed document in a new FDFDocument.  When you are done with
-     * this document you must call close() on it to release resources.
-     *
-     * @return The parsed document as an FDF document.
-     *
-     * @throws IOException If there is an error getting the document.
-     */
-    public FDFDocument getFDFDocument() throws IOException
-    {
-        COSDocument parsed = getDocument();
-        return new FDFDocument( parsed );
-    }
-
-    /**
-     * This will parse a document object from the stream.
-     *
-     * Depending on the next character this is either an xref/trailer/startxref
-     * section terminated by %%EOF, or an indirect object of the form
-     * "number generation obj ... endobj".
-     *
-     * @return The parsed object: the PDFXref of a parsed xref section, the COSObject
-     * of a parsed indirect object, or null at the end of the file or when only a
-     * startxref section was read.
-     *
-     * @throws IOException If an IO error occurs.
-     */
-    private Object parseObject() throws IOException
-    {
-        Object object = null;
-        char peekedChar = (char)pdfSource.peek();
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" );
-        }
-        if( pdfSource.isEOF() )
-        {
-            if( log.isDebugEnabled() )
-            {
-                log.debug( "Skipping because of EOF" );
-                //end of file we will return a null object and call it a day.
-            }
-        }
-        else if( peekedChar == 'x' ||
-                 peekedChar == 't' ||
-                 peekedChar == 's')
-        {
-            //System.out.println( "parseObject() parsing xref" );
-
-            //FDF documents do not always have the xref
-            if( peekedChar == 'x' || peekedChar == 't' )
-            {
-                object = parseXrefSection();
-            }
-
-            //if peeked char is xref or startxref
-            if( peekedChar == 'x' || peekedChar == 's')
-            {
-                skipSpaces();
-                while( pdfSource.peek() == 'x' )
-                {
-                    parseXrefSection();
-                }
-                String startxref = readString();
-                if( !startxref.equals( "startxref" ) )
-                {
-                    throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
-                }
-                skipSpaces();
-                //read some integer that is in the stream but PDFBox doesn't use
-                readInt();
-            }
-
-            //readString can not be used here because it strips out comments
-            //and it will think that %% is a comment in front of the EOF
-            String eof = readExpectedString( "%%EOF" );
-            if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
-            {
-                throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
-                                       " next=" +readString() );
-            }
-            else if( !pdfSource.isEOF() )
-            {
-                //we might really be at the end of the file, there might just be some crap at the
-                //end of the file.
-                if( pdfSource.available() < 1000 )
-                {
-                    //We need to determine if we are at the end of the file.
-                    byte[] data = new byte[ 1000 ];
-
-                    //look ahead, then push the bytes back so nothing is consumed yet
-                    int amountRead = pdfSource.read( data );
-                    if( amountRead != -1 )
-                    {
-                        pdfSource.unread( data, 0, amountRead );
-                    }
-                    boolean atEndOfFile = true;//we assume yes unless we find another.
-                    //i+2 must stay inside the bytes that were read, so the last start
-                    //index is amountRead-3; this includes an "EOF" that is the very
-                    //last thing in the data.
-                    for( int i=0; i<amountRead-2 && atEndOfFile; i++ )
-                    {
-                        atEndOfFile = !(data[i] == 'E' &&
-                                        data[i+1] == 'O' &&
-                                        data[i+2] == 'F' );
-                    }
-                    if( atEndOfFile )
-                    {
-                        //no further EOF marker, so the remaining bytes are discarded
-                        while( pdfSource.read( data, 0, data.length ) != -1 )
-                        {
-                            //read until done.
-                        }
-                    }
-                }
-            }
-        }
-        else
-        {
-            int number;
-            int genNum;
-            String objectKey = null;
-            try
-            {
-                number = readInt();
-            }
-            catch( IOException e )
-            {
-                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
-                //statements after an object, of course this is nonsense
-                //but because we want to support as many PDFs as possible
-                //we will simply try again
-                number = readInt();
-            }
-            skipSpaces();
-            genNum = readInt();
-            if( log.isDebugEnabled() )
-            {
-                log.debug( "Parsing object (" + number + "," + genNum + ")" );
-            }
-
-            objectKey = readString( 3 );
-            //System.out.println( "parseObject() num=" + number + " genNumber=" + genNum + " key='" + objectKey + "'" );
-            if( !objectKey.equals( "obj" ) )
-            {
-                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
-            }
-
-            skipSpaces();
-            COSBase pb = parseDirObject();
-            String endObjectKey = readString();
-            if( endObjectKey.equals( "stream" ) )
-            {
-                //push the keyword back, preceded by a space, so that parseCOSStream reads it itself
-                pdfSource.unread( endObjectKey.getBytes() );
-                pdfSource.unread( ' ' );
-                if( pb instanceof COSDictionary )
-                {
-                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
-                }
-                else
-                {
-                    // this is not legal
-                    // the combination of a dict and the stream/endstream forms a complete stream object
-                    throw new IOException("stream not preceded by dictionary");
-                }
-                endObjectKey = readString();
-            }
-            COSObjectKey key = new COSObjectKey( number, genNum );
-            COSObject pdfObject = document.getObjectFromPool( key );
-            object = pdfObject;
-            pdfObject.setObject(pb);
-
-            if( !endObjectKey.equals( "endobj" ) )
-            {
-                if( !pdfSource.isEOF() )
-                {
-                    try
-                    {
-                        //It is possible that the endobj is missing, there
-                        //are several PDFs out there that do that so skip it and move on.
-                        //A number here is taken to be the start of the next object,
-                        //so it is pushed back followed by a space.
-                        Float.parseFloat( endObjectKey );
-                        pdfSource.unread( SPACE_BYTE );
-                        pdfSource.unread( endObjectKey.getBytes() );
-                        if( log.isDebugEnabled() )
-                        {
-                            log.debug( "Missing endobj, found '" + endObjectKey +
-                                "' instead, assuming that endobj is not present and will continue parsing." );
-                        }
-                    }
-                    catch( NumberFormatException e )
-                    {
-                        //we will try again in case there was some garbage which
-                        //some writers will leave behind.
-                        String secondEndObjectKey = readString();
-                        if( !secondEndObjectKey.equals( "endobj" ) )
-                        {
-                            throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
-                                "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
-                        }
-                    }
-                }
-            }
-            skipSpaces();
-
-        }
-        //System.out.println( "parsed=" + object );
-        return object;
-    }
-
-
-    /**
-     * This will parse one xref table and the trailer that follows it.
-     *
-     * @return a new PDFXref holding the start and count read from the xref table.
-     *
-     * @throws IOException If an IO error occurs.
-     */
-    protected PDFXref parseXrefSection() throws IOException
-    {
-        //index 0 receives the start, index 1 the count
-        int[] startAndCount = new int[2];
-        parseXrefTable( startAndCount );
-        parseTrailer();
-
-        int start = startAndCount[0];
-        int count = startAndCount[1];
-        return new PDFXref( start, count );
-    }
-
-    /**
-     * This will parse the xref table from the stream.
-     *
-     * When the table starts with an "xref" line, the two integers that follow
-     * it are stored in params[0] (start) and params[1] (count).  The xref
-     * entries themselves are skipped.
-     *
-     * @param params The start and count parameters
-     *
-     * @throws IOException If an IO error occurs.
-     */
-    protected void parseXrefTable(int[] params) throws IOException
-    {
-        String token = readLine();
-        if( token.equals( "xref" ) )
-        {
-            params[0] = readInt();
-            params[1] = readInt();
-            token = readString();
-        }
-        skipSpaces();
-        while( true )
-        {
-            boolean finished = token.equals( "trailer" ) ||
-                               pdfSource.isEOF() ||
-                               isEndOfName((char)pdfSource.peek());
-            if( finished )
-            {
-                break;
-            }
-            //skip past all the xref entries.
-            token = readString();
-            skipSpaces();
-        }
-        skipSpaces();
-    }
-
-    /**
-     * This will parse a trailer dictionary from the stream.  The first trailer
-     * parsed becomes the trailer of the document, the entries of every later
-     * one are added to it.
-     *
-     * @throws IOException If an IO error occurs.
-     */
-    private void parseTrailer() throws IOException
-    {
-        COSDictionary parsedTrailer = parseCOSDictionary();
-        COSDictionary docTrailer = document.getTrailer();
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "parsedTrailer=" + parsedTrailer );
-            log.debug( "docTrailer=" + docTrailer );
-        }
-        if( docTrailer != null )
-        {
-            docTrailer.addAll( parsedTrailer );
-        }
-        else
-        {
-            document.setTrailer( parsedTrailer );
-        }
-    }
-}
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java b/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
deleted file mode 100644
index d59c5a4..0000000
--- a/src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java
+++ /dev/null
@@ -1,403 +0,0 @@
-/**
- * Copyright (c) 2003-2004, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-import java.io.ByteArrayOutputStream;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.pdfbox.cos.COSBase;
-import org.pdfbox.cos.COSBoolean;
-import org.pdfbox.cos.COSDictionary;
-import org.pdfbox.cos.COSName;
-import org.pdfbox.cos.COSNull;
-import org.pdfbox.cos.COSNumber;
-import org.pdfbox.cos.COSObject;
-import org.pdfbox.cos.COSStream;
-
-import org.pdfbox.util.PDFOperator;
-import org.pdfbox.util.ImageParameters;
-
-import org.apache.log4j.Logger;
-
-/**
- * This will parse a PDF byte stream and extract operands and such.
- *
- * @author Ben Litchfield (ben@csh.rit.edu)
- * @version $Revision: 1.29 $
- */
-public class PDFStreamParser extends BaseParser
-{
- private static Logger log = Logger.getLogger(PDFStreamParser.class);
- private List streamObjects = new ArrayList( 100 );
- private RandomAccessFile file;
- private PDFOperator lastBIToken = null;
-
-    /**
-     * Constructor that takes a stream to parse.
-     *
-     * @param stream The stream to read data from.
-     * @param raf The random access file, it is handed to parseCOSStream when a
-     *            stream is found inside of the parsed data.
-     *
-     * @throws IOException If there is an error reading from the stream.
-     */
-    public PDFStreamParser( InputStream stream, RandomAccessFile raf ) throws IOException
-    {
-        super( stream );
-        this.file = raf;
-    }
-
- /**
- * Constructor. The unfiltered data of the stream is parsed and the
- * scratch file of the stream is used as the random access file.
- *
- * @param stream The stream to parse.
- *
- * @throws IOException If there is an error initializing the stream.
- */
- public PDFStreamParser( COSStream stream ) throws IOException
- {
- this( stream.getUnfilteredStream(), stream.getScratchFile() );
- }
-
-    /**
-     * This will parse the tokens in the stream and collect them in the token
-     * list.  This will close the stream when it is finished parsing, also when
-     * parsing fails.
-     *
-     * @throws IOException If there is an error while parsing the stream.
-     */
-    public void parse() throws IOException
-    {
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "parse() start" );
-        }
-
-        try
-        {
-            //parseNextToken() returns null when there are no more tokens
-            for( Object next = parseNextToken(); next != null; next = parseNextToken() )
-            {
-                streamObjects.add( next );
-            }
-        }
-        finally
-        {
-            pdfSource.close();
-        }
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "parse() end" );
-        }
-    }
-
-    /**
-     * This will get the tokens that were parsed from the stream.  The list held
-     * by this parser is returned directly, it is not copied.
-     *
-     * @return All of the tokens in the stream.
-     */
-    public List getTokens()
-    {
-        return this.streamObjects;
-    }
-
-    /**
-     * This will parse the next token in the stream.
-     *
-     * The kind of token is chosen from the first character after any spaces:
-     * dictionary or stream, string, array, name, null, boolean, number, the
-     * inline image operators BI and ID, or any other operator.
-     *
-     * @return The next token in the stream or null if there are no more tokens in the stream.
-     *
-     * @throws IOException If an io error occurs while parsing the stream.
-     */
-    private Object parseNextToken() throws IOException
-    {
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "parseNextToken() start" );
-        }
-        Object retval = null;
-
-        skipSpaces();
-        int nextByte = pdfSource.peek();
-        //NOTE(review): because of the byte cast a data byte of 0xFF is treated the same
-        //as the -1 end of stream value; left unchanged, confirm that this is intended.
-        if( ((byte)nextByte) == -1 )
-        {
-            return null;
-        }
-        char c = (char)nextByte;
-        switch(c)
-        {
-            case '<':
-            {
-                int leftBracket = pdfSource.read();//pull off first left bracket
-                c = (char)pdfSource.peek(); //check for second left bracket
-                pdfSource.unread( leftBracket ); //put back first bracket
-                if(c == '<')
-                {
-
-                    COSDictionary pod = parseCOSDictionary();
-                    skipSpaces();
-                    //an 's' after the dictionary is taken as the start of the stream keyword
-                    if((char)pdfSource.peek() == 's')
-                    {
-                        retval = parseCOSStream( pod, file );
-                    }
-                    else
-                    {
-                        retval = pod;
-                    }
-                }
-                else
-                {
-                    retval = parseCOSString();
-                }
-                break;
-            }
-            case '[': // array
-            {
-                retval = parseCOSArray();
-                break;
-            }
-            case '(': // string
-                retval = parseCOSString();
-                break;
-            case '/':   // name
-                retval = parseCOSName();
-                break;
-            case 'n':   // null
-            {
-                String nullString = readString();
-                if( nullString.equals( "null") )
-                {
-                    retval = COSNull.NULL;
-                }
-                else
-                {
-                    retval = PDFOperator.getOperator( nullString );
-                }
-                break;
-            }
-            case 't':
-            case 'f':
-            {
-                String next = readString();
-                if( next.equals( "true" ) )
-                {
-                    retval = COSBoolean.TRUE;
-                    break;
-                }
-                else if( next.equals( "false" ) )
-                {
-                    retval = COSBoolean.FALSE;
-                }
-                else
-                {
-                    retval = PDFOperator.getOperator( next );
-                }
-                break;
-            }
-            case 'R':
-            {
-                String line = readString();
-                if( line.equals( "R" ) )
-                {
-                    retval = new COSObject( null );
-                }
-                else
-                {
-                    retval = PDFOperator.getOperator( line );
-                }
-                break;
-            }
-            case '0':
-            case '1':
-            case '2':
-            case '3':
-            case '4':
-            case '5':
-            case '6':
-            case '7':
-            case '8':
-            case '9':
-            case '-':
-            case '+':
-            case '.':
-            {
-                //c is always a digit, sign or period here, so every following digit,
-                //sign and period character is collected into one number.
-                StringBuffer buf = new StringBuffer();
-                while( Character.isDigit(( c = (char)pdfSource.peek()) )|| c== '-' || c== '+' || c =='.' )
-                {
-                    buf.append( c );
-                    pdfSource.read();
-                }
-                retval = COSNumber.get( buf.toString() );
-                break;
-            }
-            case 'B':
-            {
-                String next = readString();
-                retval = PDFOperator.getOperator( next );
-
-                if( next.equals( "BI" ) )
-                {
-                    lastBIToken = (PDFOperator)retval;
-                    COSDictionary imageParams = new COSDictionary();
-                    lastBIToken.setImageParameters( new ImageParameters( imageParams ) );
-                    Object nextToken = null;
-                    //the image parameters are name/value pairs
-                    while( (nextToken = parseNextToken()) instanceof COSName )
-                    {
-                        Object value = parseNextToken();
-                        imageParams.setItem( (COSName)nextToken, (COSBase)value );
-                    }
-                    //final token will be the image data, maybe??
-                    if( !(nextToken instanceof PDFOperator) )
-                    {
-                        //the stream ended or something other than an operator followed the
-                        //parameters, report it instead of failing in the cast below.
-                        throw new IOException( "Error: Expected the image data after the 'BI' parameters, actual='" +
-                            nextToken + "'" );
-                    }
-                    PDFOperator imageData = (PDFOperator)nextToken;
-                    lastBIToken.setImageData( imageData.getImageData() );
-                }
-                break;
-            }
-            case 'I':
-            {
-                //The image parameters of the last BI token are not needed to find the
-                //end of the data, so lastBIToken is not touched here and a token that
-                //starts with 'I' without a preceding BI no longer causes a NullPointerException.
-                //Special case for ID operator
-                String id = "" + (char)pdfSource.read() + (char)pdfSource.read();
-                if( !id.equals( "ID" ) )
-                {
-                    throw new IOException( "Error: Expected operator 'ID' actual='" + id + "'" );
-                }
-                ByteArrayOutputStream imageData = new ByteArrayOutputStream();
-                if( this.isWhitespace() )
-                {
-                    //pull off the whitespace character
-                    pdfSource.read();
-                }
-                int twoBytesAgo = 0;
-                int lastByte = pdfSource.read();
-                int currentByte = pdfSource.read();
-                //PDF spec is kinda unclear about this.  Should a whitespace
-                //always appear before EI? Not sure, I found a PDF
-                //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
-                //of the image data and will stop parsing prematurely if there is
-                //not a check for <whitespace>EI<whitespace>.
-                //amyuni2_05d__pdf1_3_acro4x.pdf has image data that
-                //is compressed, so the number of bytes expected from the image
-                //parameters is useless here.
-                while( !(isWhitespace( twoBytesAgo ) &&
-                         lastByte == 'E' &&
-                         currentByte == 'I' &&
-                         isWhitespace()
-                       ) &&
-                       !pdfSource.isEOF() )
-                {
-                    imageData.write( lastByte );
-                    twoBytesAgo = lastByte;
-                    lastByte = currentByte;
-                    currentByte = pdfSource.read();
-                }
-                pdfSource.unread( 'I' ); //unread the EI operator
-                pdfSource.unread( 'E' );
-                retval = PDFOperator.getOperator( "ID" );
-                ((PDFOperator)retval).setImageData( imageData.toByteArray() );
-                break;
-            }
-            case ']':
-            {
-                // some ']' around without its previous '['
-                // this means a PDF is somewhat corrupt but we will continue to parse.
-                pdfSource.read();
-                retval = COSNull.NULL;  // must be a better solution than null...
-                break;
-            }
-            default:
-            {
-                //we must be an operator
-                String operator = readOperator();
-                if( operator.trim().length() == 0 )
-                {
-                    //we have a corrupt stream, stop reading here
-                    retval = null;
-                }
-                else
-                {
-                    retval = PDFOperator.getOperator( operator );
-                }
-            }
-
-        }
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" );
-        }
-
-        return retval;
-    }
-
-    /**
-     * This will read an operator from the stream.  After skipping spaces,
-     * characters are read until whitespace, a closing character, the end of the
-     * stream, one of '[', '&lt;', '(', '/' or a digit is next in the stream.
-     *
-     * @return The operator that was read from the stream.
-     *
-     * @throws IOException If there is an error reading from the stream.
-     */
-    protected String readOperator() throws IOException
-    {
-        skipSpaces();
-
-        //average string size is around 2 and the normal string buffer size is
-        //about 16 so lets save some space.
-        StringBuffer operator = new StringBuffer(4);
-        while( !isWhitespace() && !isClosing() && !pdfSource.isEOF() )
-        {
-            int upcoming = pdfSource.peek();
-            boolean startsOtherToken = upcoming == (int)'[' ||
-                                       upcoming == (int)'<' ||
-                                       upcoming == (int)'(' ||
-                                       upcoming == (int)'/' ||
-                                       (upcoming >= (int)'0' && upcoming <= (int)'9');
-            if( startsOtherToken )
-            {
-                break;
-            }
-            operator.append( (char)pdfSource.read() );
-        }
-        return operator.toString();
-    }
-} \ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/PDFXref.java b/src/main/java/org/pdfbox/pdfparser/PDFXref.java
deleted file mode 100644
index abe0f35..0000000
--- a/src/main/java/org/pdfbox/pdfparser/PDFXref.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * Copyright (c) 2003, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.pdfparser;
-
-/**
- * This class represents a PDF xref as a pair of values, start and count.
- * Both values are set by the constructor and can only be read afterwards.
- *
- * @author Ben Litchfield (ben@csh.rit.edu)
- * @version $Revision: 1.3 $
- */
-public class PDFXref
-{
-
-    private long start;
-    private long count;
-
-    /**
-     * constructor.
-     *
-     * @param startValue The start attribute.
-     * @param countValue The count attribute.
-     */
-    public PDFXref( long startValue, long countValue )
-    {
-        this.setStart( startValue );
-        this.setCount( countValue );
-    }
-
-    /**
-     * This will get the start attribute.
-     *
-     * @return The start.
-     */
-    public long getStart()
-    {
-        return this.start;
-    }
-
-    /**
-     * This will get the count attribute.
-     *
-     * @return The count.
-     */
-    public long getCount()
-    {
-        return this.count;
-    }
-
-    /**
-     * This will set the start attribute.
-     *
-     * @param newStart The new start attribute.
-     */
-    private void setStart(long newStart)
-    {
-        this.start = newStart;
-    }
-
-    /**
-     * This will set the count attribute.
-     *
-     * @param newCount The new count.
-     */
-    private void setCount(long newCount)
-    {
-        this.count = newCount;
-    }
-} \ No newline at end of file
diff --git a/src/main/java/org/pdfbox/pdfparser/package.html b/src/main/java/org/pdfbox/pdfparser/package.html
deleted file mode 100644
index fe012c1..0000000
--- a/src/main/java/org/pdfbox/pdfparser/package.html
+++ /dev/null
@@ -1,9 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head>
-
-</head>
-<body>
-The pdfparser package contains classes to parse PDF documents and objects within the document.
-</body>
-</html>