From c68ad0ec056b37c82debebcecfcde1866d61b4d9 Mon Sep 17 00:00:00 2001 From: tknall Date: Tue, 25 Nov 2008 12:03:13 +0000 Subject: Removing pdfbox from source. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@301 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- src/main/java/org/pdfbox/pdfparser/BaseParser.java | 1369 -------------------- .../pdfbox/pdfparser/PDFObjectStreamParser.java | 137 -- src/main/java/org/pdfbox/pdfparser/PDFParser.java | 557 -------- .../java/org/pdfbox/pdfparser/PDFStreamParser.java | 403 ------ src/main/java/org/pdfbox/pdfparser/PDFXref.java | 96 -- src/main/java/org/pdfbox/pdfparser/package.html | 9 - 6 files changed, 2571 deletions(-) delete mode 100644 src/main/java/org/pdfbox/pdfparser/BaseParser.java delete mode 100644 src/main/java/org/pdfbox/pdfparser/PDFObjectStreamParser.java delete mode 100644 src/main/java/org/pdfbox/pdfparser/PDFParser.java delete mode 100644 src/main/java/org/pdfbox/pdfparser/PDFStreamParser.java delete mode 100644 src/main/java/org/pdfbox/pdfparser/PDFXref.java delete mode 100644 src/main/java/org/pdfbox/pdfparser/package.html (limited to 'src/main/java/org/pdfbox/pdfparser') diff --git a/src/main/java/org/pdfbox/pdfparser/BaseParser.java b/src/main/java/org/pdfbox/pdfparser/BaseParser.java deleted file mode 100644 index 3937025..0000000 --- a/src/main/java/org/pdfbox/pdfparser/BaseParser.java +++ /dev/null @@ -1,1369 +0,0 @@ -/** - * Copyright (c) 2003-2005, www.pdfbox.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. 
Neither the name of pdfbox; nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * http://www.pdfbox.org - * - */ -package org.pdfbox.pdfparser; - -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.io.RandomAccessFile; - -import java.util.ArrayList; -import java.util.List; - -import org.pdfbox.io.ByteArrayPushBackInputStream; -import org.pdfbox.io.PushBackInputStream; - -import org.pdfbox.cos.COSArray; -import org.pdfbox.cos.COSBase; -import org.pdfbox.cos.COSBoolean; -import org.pdfbox.cos.COSDictionary; -import org.pdfbox.cos.COSDocument; -import org.pdfbox.cos.COSInteger; -import org.pdfbox.cos.COSName; -import org.pdfbox.cos.COSNull; -import org.pdfbox.cos.COSNumber; -import org.pdfbox.cos.COSObject; -import org.pdfbox.cos.COSStream; -import org.pdfbox.cos.COSString; - -import org.pdfbox.persistence.util.COSObjectKey; -import org.apache.log4j.Logger; - -/** - * This class is used to contain parsing logic that will be used by both the - * PDFParser and the COSStreamParser. 
- * - * @author Ben Litchfield (ben@benlitchfield.com) - * @version $Revision: 1.57 $ - */ -public abstract class BaseParser -{ - private static Logger log = Logger.getLogger(BaseParser.class); - - /** - * This is a byte array that will be used for comparisons. - */ - public static final byte[] ENDSTREAM = - new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" ); - - /** - * This is a byte array that will be used for comparisons. - */ - public static final String DEF = "def"; - - /** - * This is the stream that will be read from. - */ - //protected PushBackByteArrayStream pdfSource; - protected PushBackInputStream pdfSource; - - /** - * moved xref here, is a persistence construct - * maybe not needed anyway when not read from behind with delayed - * access to objects. - */ - private List xrefs = new ArrayList(); - - private COSDocument document; - - /** - * Constructor. - * - * @param input The input stream to read the data from. - * - * @throws IOException If there is an error reading the input stream. - */ - public BaseParser( InputStream input) throws IOException - { - //pdfSource = new PushBackByteArrayStream( input ); - pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 ); - } - - /** - * Constructor. - * - * @param input The array to read the data from. - * - * @throws IOException If there is an error reading the byte data. - */ - protected BaseParser(byte[] input) throws IOException - { - pdfSource = new ByteArrayPushBackInputStream(input); - } - - /** - * Set the document for this stream. - * - * @param doc The current document. 
- */ - public void setDocument( COSDocument doc ) - { - document = doc; - } - - private static boolean isHexDigit(char ch) - { - return (ch >= '0' && ch <= '9') || - (ch >= 'a' && ch <= 'f') || - (ch >= 'A' && ch <= 'F'); - // the line below can lead to problems with certain versions of the IBM JIT compiler - // (and is slower anyway) - //return (HEXDIGITS.indexOf(ch) != -1); - } - - /** - * This will parse a PDF dictionary value. - * - * @return The parsed Dictionary object. - * - * @throws IOException If there is an error parsing the dictionary object. - */ - private COSBase parseCOSDictionaryValue() throws IOException - { - - if( log.isDebugEnabled() ) - { - log.debug("parseCOSDictionaryValue() " + pdfSource ); - } - COSBase retval = null; - COSBase number = parseDirObject(); - skipSpaces(); - char next = (char)pdfSource.peek(); - if( next >= '0' && next <= '9' ) - { - COSBase generationNumber = parseDirObject(); - skipSpaces(); - char r = (char)pdfSource.read(); - if( r != 'R' ) - { - throw new IOException( "expected='R' actual='" + r + "' " + pdfSource ); - } - COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(), - ((COSInteger) generationNumber).intValue()); - retval = document.getObjectFromPool(key); - } - else - { - retval = number; - } - return retval; - } - - /** - * This will parse a PDF dictionary. - * - * @return The parsed dictionary. - * - * @throws IOException IF there is an error reading the stream. 
- */ - protected COSDictionary parseCOSDictionary() throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug("parseCOSDictionary() " + pdfSource ); - } - char c = (char)pdfSource.read(); - if( c != '<') - { - throw new IOException( "expected='<' actual='" + c + "'" ); - } - c = (char)pdfSource.read(); - if( c != '<') - { - throw new IOException( "expected='<' actual='" + c + "' " + pdfSource ); - } - skipSpaces(); - COSDictionary obj = new COSDictionary(); - boolean done = false; - while( !done ) - { - skipSpaces(); - c = (char)pdfSource.peek(); - if( c == '>') - { - done = true; - } - else - { - COSName key = parseCOSName(); - COSBase value = parseCOSDictionaryValue(); - skipSpaces(); - if( ((char)pdfSource.peek()) == 'd' ) - { - //if the next string is 'def' then we are parsing a cmap stream - //and want to ignore it, otherwise throw an exception. - String potentialDEF = readString(); - if( !potentialDEF.equals( DEF ) ) - { - pdfSource.unread( potentialDEF.getBytes() ); - } - else - { - skipSpaces(); - } - } - - if( value == null ) - { - throw new IOException("Bad Dictionary Declaration " + pdfSource ); - } - obj.setItem( key, value ); - } - } - char ch = (char)pdfSource.read(); - if( ch != '>' ) - { - throw new IOException( "expected='>' actual='" + ch + "'" ); - } - ch = (char)pdfSource.read(); - if( ch != '>' ) - { - throw new IOException( "expected='>' actual='" + ch + "'" ); - } - if( log.isDebugEnabled() ) - { - log.debug("parseCOSDictionary() done peek='" + pdfSource.peek() + "'" ); - } - return obj; - } - - /** - * This will read a COSStream from the input stream. - * - * @param file The file to write the stream to when reading. - * @param dic The dictionary that goes with this stream. - * - * @return The parsed pdf stream. - * - * @throws IOException If there is an error reading the stream. 
- */ - protected COSStream parseCOSStream( COSDictionary dic, RandomAccessFile file ) throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug("parseCOSStream() " + pdfSource ); - } - COSStream stream = new COSStream( dic, file ); - OutputStream out = null; - try - { - String streamString = readString(); - //long streamLength; - - if (!streamString.equals("stream")) - { - throw new IOException("expected='stream' actual='" + streamString + "'"); - } - - //PDF Ref 3.2.7 A stream must be followed by either - //a CRLF or LF but nothing else. - - int whitespace = pdfSource.read(); - - //see brother_scan_cover.pdf, it adds whitespaces - //after the stream but before the start of the - //data, so just read those first - while (whitespace == 0x20) - { - whitespace = pdfSource.read(); - } - - if( whitespace == 0x0D ) - { - whitespace = pdfSource.read(); - if( whitespace != 0x0A ) - { - pdfSource.unread( whitespace ); - //The spec says this is invalid but it happens in the real - //world so we must support it. - //throw new IOException("expected='0x0A' actual='0x" + - // Integer.toHexString(whitespace) + "' " + pdfSource); - } - } - else if (whitespace == 0x0A) - { - //that is fine - } - else - { - //we are in an error. 
- //but again we will do a lenient parsing and just assume that everything - //is fine - pdfSource.unread( whitespace ); - //throw new IOException("expected='0x0D or 0x0A' actual='0x" + - //Integer.toHexString(whitespace) + "' " + pdfSource); - - } - - - COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH); - long length = -1; - if( streamLength instanceof COSNumber ) - { - length = ((COSNumber)streamLength).intValue(); - } - else if( streamLength instanceof COSObject && - ((COSObject)streamLength).getObject() instanceof COSNumber ) - { - length = ((COSNumber)((COSObject)streamLength).getObject()).intValue(); - } - - //length = -1; - //streamLength = null; - - //Need to keep track of the - out = stream.createFilteredStream( streamLength ); - String endStream = null; - //the length is wrong in some pdf documents which means - //that PDFBox must basically ignore it in order to be able to read - //the most number of PDF documents. This of course is a penalty hit, - //maybe I could implement a faster parser. 
- /**if( length != -1 ) - { - byte[] buffer = new byte[1024]; - int amountRead = 0; - int totalAmountRead = 0; - while( amountRead != -1 && totalAmountRead < length ) - { - int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead)); - amountRead = pdfSource.read(buffer,0,maxAmountToRead); - totalAmountRead += amountRead; - if( amountRead != -1 ) - { - out.write( buffer, 0, amountRead ); - } - } - } - else - {**/ - readUntilEndStream( out ); - /**}*/ - skipSpaces(); - endStream = readString(); - - if (!endStream.equals("endstream")) - { - readUntilEndStream( out ); - endStream = readString(); - if( !endStream.equals( "endstream" ) ) - { - throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource); - } - } - } - finally - { - if( out != null ) - { - out.close(); - } - } - if( log.isDebugEnabled() ) - { - log.debug("parseCOSStream() done" ); - } - return stream; - } - - private void readUntilEndStream( OutputStream out ) throws IOException - { - int currentIndex = 0; - int byteRead = 0; - //this is the additional bytes buffered but not written - int additionalBytes=0; - byte[] buffer = new byte[ENDSTREAM.length+additionalBytes]; - int writeIndex = 0; - while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 ) - { - writeIndex = currentIndex - buffer.length; - if( writeIndex >= 0 ) - { - out.write( buffer[writeIndex%buffer.length] ); - } - byteRead = pdfSource.read(); - buffer[currentIndex%buffer.length] = (byte)byteRead; - currentIndex++; - } - - //we want to ignore the end of the line data when reading a stream - //so will make an attempt to ignore it. 
- /*writeIndex = currentIndex - buffer.length; - if( buffer[writeIndex%buffer.length] == 13 && - buffer[(writeIndex+1)%buffer.length] == 10 ) - { - //then ignore the newline before the endstream - } - else if( buffer[(writeIndex+1)%buffer.length] == 10 ) - { - //Then first byte is data, second byte is newline - out.write( buffer[writeIndex%buffer.length] ); - } - else - { - out.write( buffer[writeIndex%buffer.length] ); - out.write( buffer[(writeIndex+1)%buffer.length] ); - }*/ - - /** - * Old way of handling newlines before endstream - for( int i=0; i=0 && - //buffer[writeIndex%buffer.length] != 10 && - buffer[writeIndex%buffer.length] != 13 ) - { - out.write( buffer[writeIndex%buffer.length] ); - } - currentIndex++; - } - */ - pdfSource.unread( ENDSTREAM ); - - } - - /** - * This basically checks to see if the next compareTo.length bytes of the - * buffer match the compareTo byte array. - */ - private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo ) - { - int cmpLen = compareTo.length; - int buflen = buffer.length; - boolean match = true; - int off = currentIndex-cmpLen; - if( off < 0 ) - { - match = false; - } - for( int i=0; match && i 0 && c != -1) - { - char ch = (char)c; - int nextc = -2; // not yet read - //if( log.isDebugEnabled() ) - //{ - // log.debug( "Parsing COSString character '" + c + "' code=" + (int)c ); - //} - - if(ch == closeBrace) - { - braces--; - byte[] nextThreeBytes = new byte[3]; - int amountRead = pdfSource.read(nextThreeBytes); - - //lets handle the special case seen in Bull River Rules and Regulations.pdf - //The dictionary looks like this - // 2 0 obj - // << - // /Type /Info - // /Creator (PaperPort http://www.scansoft.com) - // /Producer (sspdflib 1.0 http://www.scansoft.com) - // /Title ( (5) - // /Author () - // /Subject () - // - // Notice the /Title, the braces are not even but they should - // be. 
So lets assume that if we encounter an this scenario - // then that - // means that there is an error in the pdf and assume that - // was the end of the document. - if( amountRead == 3 ) - { - if( nextThreeBytes[0] == 0x0d && - nextThreeBytes[1] == 0x0a && - nextThreeBytes[2] == 0x2f ) - { - braces = 0; - } - } - pdfSource.unread( nextThreeBytes, 0, amountRead ); - if( braces != 0 ) - { - retval.append( ch ); - } - } - else if( ch == openBrace ) - { - braces++; - retval.append( ch ); - } - else if( ch == '\\' ) - { - //patched by ram - char next = (char)pdfSource.read(); - switch(next) - { - case 'n': - retval.append( '\n' ); - break; - case 'r': - retval.append( '\r' ); - break; - case 't': - retval.append( '\t' ); - break; - case 'b': - retval.append( '\b' ); - break; - case 'f': - retval.append( '\f' ); - break; - case '(': - case ')': - case '\\': - retval.append( next ); - break; - case 10: - case 13: - //this is a break in the line so ignore it and the newline and continue - c = pdfSource.read(); - while( isEOL(c) && c != -1) - { - c = pdfSource.read(); - } - nextc = c; - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - { - StringBuffer octal = new StringBuffer(); - octal.append( next ); - c = pdfSource.read(); - char digit = (char)c; - if( digit >= '0' && digit <= '7' ) - { - octal.append( digit ); - c = pdfSource.read(); - digit = (char)c; - if( digit >= '0' && digit <= '7' ) - { - octal.append( digit ); - } - else - { - nextc = c; - } - } - else - { - nextc = c; - } - - int character = 0; - try - { - character = Integer.parseInt( octal.toString(), 8 ); - } - catch( NumberFormatException e ) - { - throw new IOException( "Error: Expected octal character, actual='" + octal + "'" ); - } - retval.append( character ); - break; - } - default: - { - retval.append( '\\' ); - retval.append( next ); - //another ficken problem with PDF's, sometimes the \ doesn't really - //mean escape like the PDF spec says it 
does, sometimes is should be literal - //which is what we will assume here. - //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource ); - } - } - } - else - { - if( openBrace == '<' ) - { - if( isHexDigit(ch) ) - { - retval.append( ch ); - } - } - else - { - retval.append( ch ); - } - } - if (nextc != -2) - { - c = nextc; - } - else - { - c = pdfSource.read(); - } - } - if (c != -1) - { - pdfSource.unread(c); - } - if( openBrace == '<' ) - { - retval = COSString.createFromHexString( retval.getString() ); - } - if( log.isDebugEnabled() ) - { - log.debug("parseCOSString() done parsed=" + retval ); - } - return retval; - } - - /** - * This will parse a PDF array object. - * - * @return The parsed PDF array. - * - * @throws IOException If there is an error parsing the stream. - */ - protected COSArray parseCOSArray() throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug("parseCOSArray() " + pdfSource ); - } - char ch = (char)pdfSource.read(); - if( ch != '[') - { - throw new IOException( "expected='[' actual='" + ch + "'" ); - } - COSArray po = new COSArray(); - COSBase pbo = null; - skipSpaces(); - int i = 0; - while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') ) - { - pbo = parseDirObject(); - if( pbo instanceof COSObject ) - { - COSInteger genNumber = (COSInteger)po.remove( po.size() -1 ); - COSInteger number = (COSInteger)po.remove( po.size() -1 ); - COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue()); - pbo = document.getObjectFromPool(key); - } - if( pbo != null ) - { - po.add( pbo ); - } - else - { - //it could be a bad object in the array which is just skipped - } - skipSpaces(); - } - pdfSource.read(); //read ']' - skipSpaces(); - if( log.isDebugEnabled() ) - { - log.debug("parseCOSArray() done peek='" + (char)pdfSource.peek() + "'" ); - } - return po; - } - - /** - * Determine if a character terminates a PDF name. 
- * - * @param ch The character - * @return true if the character terminates a PDF name, otherwise false. - */ - protected boolean isEndOfName(char ch) - { - return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<' - || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' || - ch == -1 //EOF - ); - } - - /** - * This will parse a PDF name from the stream. - * - * @return The parsed PDF name. - * - * @throws IOException If there is an error reading from the stream. - */ - protected COSName parseCOSName() throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug("parseCOSName() " + pdfSource ); - } - COSName retval = null; - int c = pdfSource.read(); - if( (char)c != '/') - { - throw new IOException("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource ); - } - // costruisce il nome - StringBuffer buffer = new StringBuffer(); - c = pdfSource.read(); - while( c != -1 ) - { - char ch = (char)c; - if(ch == '#') - { - char ch1 = (char)pdfSource.read(); - char ch2 = (char)pdfSource.read(); - - // Prior to PDF v1.2, the # was not a special character. Also, - // it has been observed that various PDF tools do not follow the - // spec with respect to the # escape, even though they report - // PDF versions of 1.2 or later. The solution here is that we - // interpret the # as an escape only when it is followed by two - // valid hex digits. 
- // - if (isHexDigit(ch1) && isHexDigit(ch2)) - { - String hex = "" + ch1 + ch2; - try - { - buffer.append( (char) Integer.parseInt(hex, 16)); - } - catch (NumberFormatException e) - { - if( log.isDebugEnabled() ) - { - log.debug("isHexDigit(ch1)=" + isHexDigit(ch1) + ", isHexDigit(ch2)=" + isHexDigit(ch2)); - } - throw new IOException("Error: expected hex number, actual='" + hex + "'"); - } - c = pdfSource.read(); - } - else - { - pdfSource.unread(ch2); - c = ch1; - buffer.append( ch ); - } - } - else if (isEndOfName(ch)) - { - break; - } - else - { - buffer.append( ch ); - c = pdfSource.read(); - } - } - if (c != -1) - { - pdfSource.unread(c); - } - retval = COSName.getPDFName( buffer.toString() ); - return retval; - } - - /** - * This will parse a boolean object from the stream. - * - * @return The parsed boolean object. - * - * @throws IOException If an IO error occurs during parsing. - */ - protected COSBoolean parseBoolean() throws IOException - { - COSBoolean retval = null; - char c = (char)pdfSource.peek(); - if( c == 't' ) - { - byte[] trueArray = new byte[ 4 ]; - int amountRead = pdfSource.read( trueArray, 0, 4 ); - String trueString = new String( trueArray, 0, amountRead ); - if( !trueString.equals( "true" ) ) - { - throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" ); - } - else - { - retval = COSBoolean.TRUE; - } - } - else if( c == 'f' ) - { - byte[] falseArray = new byte[ 5 ]; - int amountRead = pdfSource.read( falseArray, 0, 5 ); - String falseString = new String( falseArray, 0, amountRead ); - if( !falseString.equals( "false" ) ) - { - throw new IOException( "Error parsing boolean: expected='true' actual='" + falseString + "'" ); - } - else - { - retval = COSBoolean.FALSE; - } - } - else - { - throw new IOException( "Error parsing boolean expected='t or f' actual='" + c + "'" ); - } - return retval; - } - - /** - * This will parse a directory object from the stream. - * - * @return The parsed object. 
- * - * @throws IOException If there is an error during parsing. - */ - protected COSBase parseDirObject() throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug("parseDirObject() " + pdfSource ); - } - COSBase retval = null; - - skipSpaces(); - int nextByte = pdfSource.peek(); - char c = (char)nextByte; - switch(c) - { - case '<': - { - int leftBracket = pdfSource.read();//pull off first left bracket - c = (char)pdfSource.peek(); //check for second left bracket - pdfSource.unread( leftBracket ); - if(c == '<') - { - - retval = parseCOSDictionary(); - skipSpaces(); - } - else - { - retval = parseCOSString(); - } - break; - } - case '[': // array - { - retval = parseCOSArray(); - break; - } - case '(': - retval = parseCOSString(); - break; - case '/': // name - retval = parseCOSName(); - break; - case 'n': // null - { - String nullString = readString(); - if( !nullString.equals( "null") ) - { - throw new IOException("Expected='null' actual='" + nullString + "'"); - } - retval = COSNull.NULL; - break; - } - case 't': - { - byte[] trueBytes = new byte[4]; - int amountRead = pdfSource.read( trueBytes, 0, 4 ); - String trueString = new String( trueBytes, 0, amountRead ); - if( trueString.equals( "true" ) ) - { - retval = COSBoolean.TRUE; - } - else - { - throw new IOException( "expected true actual='" + trueString + "' " + pdfSource ); - } - break; - } - case 'f': - { - byte[] falseBytes = new byte[5]; - int amountRead = pdfSource.read( falseBytes, 0, 5 ); - String falseString = new String( falseBytes, 0, amountRead ); - if( falseString.equals( "false" ) ) - { - retval = COSBoolean.FALSE; - } - else - { - throw new IOException( "expected false actual='" + falseString + "' " + pdfSource ); - } - break; - } - case 'R': - pdfSource.read(); - retval = new COSObject(null); - break; - case (char)-1: - return null; - default: - { - if( Character.isDigit(c) || c == '-' || c == '+' || c == '.') - { - StringBuffer buf = new StringBuffer(); - int ic = 
pdfSource.read(); - c = (char)ic; - while( Character.isDigit( c )|| - c == '-' || - c == '+' || - c == '.' || - c == 'E' || - c == 'e' ) - { - buf.append( c ); - ic = pdfSource.read(); - c = (char)ic; - } - if( ic != -1 ) - { - pdfSource.unread( ic ); - } - retval = COSNumber.get( buf.toString() ); - } - else - { - //This is not suppose to happen, but we will allow for it - //so we are more compatible with POS writers that don't - //follow the spec - String badString = readString(); - //throw new IOException( "Unknown dir object c='" + c + - //"' peek='" + (char)pdfSource.peek() + "' " + pdfSource ); - if( log.isDebugEnabled() ) - { - log.debug("parseDirObject() bad DIR object found. ignoring: '" + badString + "'"); - } - if( badString == null || badString.length() == 0 ) - { - int peek = pdfSource.peek(); - // we can end up in an infinite loop otherwise - throw new IOException( "Unknown dir object c='" + c + - "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " " + pdfSource ); - } - - } - } - } - if( log.isDebugEnabled() ) - { - log.debug("parseDirObject() done retval=" +retval ); - } - return retval; - } - - /** - * This will read the next string from the stream. - * - * @return The string that was read from the stream. - * - * @throws IOException If there is an error reading from the stream. - */ - protected String readString() throws IOException - { - skipSpaces(); - StringBuffer buffer = new StringBuffer(); - int c = pdfSource.read(); - while( !isEndOfName((char)c) && !isClosing(c) && c != -1 ) - { - buffer.append( (char)c ); - c = pdfSource.read(); - } - if (c != -1) - { - pdfSource.unread(c); - } - return buffer.toString(); - } - - /** - * This will read bytes until the end of line marker occurs. - * - * @param theString The next expected string in the stream. - * - * @return The characters between the current position and the end of the line. 
- * - * @throws IOException If there is an error reading from the stream or theString does not match what was read. - */ - protected String readExpectedString( String theString ) throws IOException - { - int c = pdfSource.read(); - while( isWhitespace(c) && c != -1) - { - c = pdfSource.read(); - } - StringBuffer buffer = new StringBuffer( theString.length() ); - int charsRead = 0; - while( !isEOL(c) && c != -1 && charsRead < theString.length() ) - { - char next = (char)c; - buffer.append( next ); - if( theString.charAt( charsRead ) == next ) - { - charsRead++; - } - else - { - throw new IOException( "Error: Expected to read '" + theString + - "' instead started reading '" +buffer.toString() + "'" ); - } - c = pdfSource.read(); - } - while( isEOL(c) && c != -1 ) - { - c = pdfSource.read(); - } - if (c != -1) - { - pdfSource.unread(c); - } - return buffer.toString(); - } - - /** - * This will read the next string from the stream up to a certain length. - * - * @param length The length to stop reading at. - * - * @return The string that was read from the stream of length 0 to length. - * - * @throws IOException If there is an error reading from the stream. - */ - protected String readString( int length ) throws IOException - { - skipSpaces(); - - int c = pdfSource.read(); - - //average string size is around 2 and the normal string buffer size is - //about 16 so lets save some space. - StringBuffer buffer = new StringBuffer(length); - while( !isWhitespace(c) && !isClosing(c) && c != -1 && buffer.length() < length && - c != '[' && - c != '<' && - c != '(' && - c != '/' ) - { - buffer.append( (char)c ); - c = pdfSource.read(); - } - if (c != -1) - { - pdfSource.unread(c); - } - return buffer.toString(); - } - - /** - * This will tell if the next character is a closing brace( close of PDF array ). - * - * @return true if the next byte is ']', false otherwise. - * - * @throws IOException If an IO error occurs. 
- */ - protected boolean isClosing() throws IOException - { - return isClosing(pdfSource.peek()); - } - - /** - * This will tell if the next character is a closing brace( close of PDF array ). - * - * @param c The character to check against end of line - * @return true if the next byte is ']', false otherwise. - */ - protected boolean isClosing(int c) - { - return c == ']'; - } - - /** - * This will read bytes until the end of line marker occurs. - * - * @return The characters between the current position and the end of the line. - * - * @throws IOException If there is an error reading from the stream. - */ - protected String readLine() throws IOException - { - int c = pdfSource.read(); - while(isWhitespace(c) && c != -1) - { - c = pdfSource.read(); - } - StringBuffer buffer = new StringBuffer( 11 ); - - while( !isEOL(c) && c != -1 ) - { - buffer.append( (char)c ); - c = pdfSource.read(); - } - while( isEOL(c) && c != -1 ) - { - c = pdfSource.read(); - } - if (c != -1) - { - pdfSource.unread(c); - } - return buffer.toString(); - } - - /** - * This will tell if the next byte to be read is an end of line byte. - * - * @return true if the next byte is 0x0A or 0x0D. - * - * @throws IOException If there is an error reading from the stream. - */ - protected boolean isEOL() throws IOException - { - return isEOL(pdfSource.peek()); - } - - /** - * This will tell if the next byte to be read is an end of line byte. - * - * @param c The character to check against end of line - * @return true if the next byte is 0x0A or 0x0D. - */ - protected boolean isEOL(int c) - { - return c == 10 || c == 13; - } - - /** - * This will tell if the next byte is whitespace or not. - * - * @return true if the next byte in the stream is a whitespace character. - * - * @throws IOException If there is an error reading from the stream. 
- */ - protected boolean isWhitespace() throws IOException - { - return isWhitespace( pdfSource.peek() ); - } - - /** - * This will tell if the next byte is whitespace or not. - * - * @param c The character to check against whitespace - * - * @return true if the next byte in the stream is a whitespace character. - */ - protected boolean isWhitespace( int c ) - { - return c == 0 || c == 9 || c == 12 || c == 10 - || c == 13 || c == 32; - } - - /** - * This will skip all spaces and comments that are present. - * - * @throws IOException If there is an error reading from the stream. - */ - protected void skipSpaces() throws IOException - { - //log( "skipSpaces() " + pdfSource ); - int c = pdfSource.read(); - // identical to, but faster as: isWhiteSpace(c) || c == 37 - while(c == 0 || c == 9 || c == 12 || c == 10 - || c == 13 || c == 32 || c == 37)//37 is the % character, a comment - { - if ( c == 37 ) - { - // skip past the comment section - c = pdfSource.read(); - while(!isEOL(c) && c != -1) - { - c = pdfSource.read(); - } - } - else - { - c = pdfSource.read(); - } - } - if (c != -1) - { - pdfSource.unread(c); - } - //log( "skipSpaces() done peek='" + (char)pdfSource.peek() + "'" ); - } - - /** - * this will compare two byte arrays. - * - * @param first The first byte array to compare. - * @param second The second byte array to compare. - * - * @return true if both arrays are the same AND forall i : first[i] = second[i] - */ - private boolean cmpArray( byte[] first, byte[] second ) - { - return cmpArray( first, 0, second ); - } - - /** - * This will compare two arrays for equality. - * - * @param first The first array to compare. - * @param firstOffset The first byte to start comparing. - * @param second The second array to compare. 
- */ - private boolean cmpArray( byte[] first, int firstOffset, byte[] second ) - { - boolean retval = true; - if( first.length-firstOffset >= second.length ) - { - int arrayLength = second.length; - for( int i =0; i 0 ) - { - //trim off any leading characters - header = header.substring( headerStart, header.length() ); - } - - try - { - float pdfVersion = Float.parseFloat( - header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) ); - document.setVersion( pdfVersion ); - } - catch( NumberFormatException e ) - { - throw new IOException( "Error getting pdf version:" + e ); - } - - skipHeaderFillBytes(); - - - Object nextObject; - boolean wasLastParsedObjectAnXref = false; - try - { - while( (nextObject = parseObject()) != null ) - { - if( nextObject instanceof PDFXref ) - { - PDFXref xref = (PDFXref)nextObject; - addXref(xref); - wasLastParsedObjectAnXref = true; - } - else - { - wasLastParsedObjectAnXref = false; - } - skipSpaces(); - } - if( document.getTrailer() == null ) - { - COSDictionary trailer = new COSDictionary(); - Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator(); - while( xrefIter.hasNext() ) - { - COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject(); - trailer.addAll( next ); - } - document.setTrailer( trailer ); - } - if( !document.isEncrypted() ) - { - document.dereferenceObjectStreams(); - } - } - catch( IOException e ) - { - if( wasLastParsedObjectAnXref ) - { - log.debug( "Skipping some garbage", e ); - //Then we assume that there is just random garbage after - //the xref, not sure why the PDF spec allows this but it does. 
- } - else - { - //some other error so just pass it along - throw e; - } - } - } - catch( Throwable t ) - { - //so if the PDF is corrupt then close the document and clear - //all resources to it - if( document != null ) - { - document.close(); - } - if( t instanceof IOException ) - { - throw (IOException)t; - } - else - { - throw new WrappedIOException( t ); - } - } - finally - { - pdfSource.close(); - } - } - - /** - * This will skip a header's binary fill bytes. This is in accordance to - * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header" - * - * @throws IOException If there is an error reading from the stream. - */ - protected void skipHeaderFillBytes() throws IOException - { - skipSpaces(); - int c = pdfSource.peek(); - - if( !Character.isDigit( (char)c ) ) - { - // Fill bytes conform with PDF reference (but without comment sign) - // => skip until EOL - readLine(); - } - // else: no fill bytes - } - - /** - * This will get the document that was parsed. parse() must be called before this is called. - * When you are done with this document you must call close() on it to release - * resources. - * - * @return The document that was parsed. - * - * @throws IOException If there is an error getting the document. - */ - public COSDocument getDocument() throws IOException - { - if( document == null ) - { - throw new IOException( "You must call parse() before calling getDocument()" ); - } - return document; - } - - /** - * This will get the PD document that was parsed. When you are done with - * this document you must call close() on it to release resources. - * - * @return The document at the PD layer. - * - * @throws IOException If there is an error getting the document. - */ - public PDDocument getPDDocument() throws IOException - { - return new PDDocument( getDocument() ); - } - - /** - * This will get the FDF document that was parsed. When you are done with - * this document you must call close() on it to release resources. 
- * - * @return The document at the PD layer. - * - * @throws IOException If there is an error getting the document. - */ - public FDFDocument getFDFDocument() throws IOException - { - return new FDFDocument( getDocument() ); - } - - /** - * This will parse a document object from the stream. - * - * @return The parsed object. - * - * @throws IOException If an IO error occurs. - */ - private Object parseObject() throws IOException - { - Object object = null; - char peekedChar = (char)pdfSource.peek(); - if( log.isDebugEnabled() ) - { - log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" ); - } - if( pdfSource.isEOF() ) - { - if( log.isDebugEnabled() ) - { - log.debug( "Skipping because of EOF" ); - //end of file we will return a null object and call it a day. - } - } - else if( peekedChar == 'x' || - peekedChar == 't' || - peekedChar == 's') - { - //System.out.println( "parseObject() parsing xref" ); - - //FDF documents do not always have the xref - if( peekedChar == 'x' || peekedChar == 't' ) - { - object = parseXrefSection(); - } - - //if peeked char is xref or startxref - if( peekedChar == 'x' || peekedChar == 's') - { - skipSpaces(); - while( pdfSource.peek() == 'x' ) - { - parseXrefSection(); - } - String startxref = readString(); - if( !startxref.equals( "startxref" ) ) - { - throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource ); - } - skipSpaces(); - //read some integer that is in the stream but PDFBox doesn't use - readInt(); - } - - //This MUST be readLine because readString strips out comments - //and it will think that %% is a comment in from of the EOF - String eof = readExpectedString( "%%EOF" ); - if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() ) - { - throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() + - " next=" +readString() ); - } - else if( !pdfSource.isEOF() ) - { - //we might really be at the end of the file, there might just be some crap at the - //end of the 
file. - if( pdfSource.available() < 1000 ) - { - //We need to determine if we are at the end of the file. - byte[] data = new byte[ 1000 ]; - - int amountRead = pdfSource.read( data ); - if( amountRead != -1 ) - { - pdfSource.unread( data, 0, amountRead ); - } - boolean atEndOfFile = true;//we assume yes unless we find another. - for( int i=0; iEI. - while( !(isWhitespace( twoBytesAgo ) && - lastByte == 'E' && - currentByte == 'I' && - isWhitespace() //&& - //amyuni2_05d__pdf1_3_acro4x.pdf has image data that - //is compressed, so expectedBytes is useless here. - //count >= expectedBytes - ) && - !pdfSource.isEOF() ) - { - imageData.write( lastByte ); - twoBytesAgo = lastByte; - lastByte = currentByte; - currentByte = pdfSource.read(); - count++; - } - pdfSource.unread( 'I' ); //unread the EI operator - pdfSource.unread( 'E' ); - retval = PDFOperator.getOperator( "ID" ); - ((PDFOperator)retval).setImageData( imageData.toByteArray() ); - break; - } - case ']': - { - // some ']' around without its previous '[' - // this means a PDF is somewhat corrupt but we will continue to parse. - pdfSource.read(); - retval = COSNull.NULL; // must be a better solution than null... - break; - } - default: - { - //we must be an operator - String operator = readOperator(); - if( operator.trim().length() == 0 ) - { - //we have a corrupt stream, stop reading here - retval = null; - } - else - { - retval = PDFOperator.getOperator( operator ); - } - } - - } - if( log.isDebugEnabled() ) - { - log.debug( "parseNextToken() retval=" + retval + " peek=" + (char)pdfSource.peek() + " end" ); - } - - return retval; - } - - /** - * This will read an operator from the stream. - * - * @return The operator that was read from the stream. - * - * @throws IOException If there is an error reading from the stream. - */ - protected String readOperator() throws IOException - { - skipSpaces(); - - //average string size is around 2 and the normal string buffer size is - //about 16 so lets save some space. 
- StringBuffer buffer = new StringBuffer(4); - while( - !isWhitespace() && - !isClosing() && - !pdfSource.isEOF() && - pdfSource.peek() != (int)'[' && - pdfSource.peek() != (int)'<' && - pdfSource.peek() != (int)'(' && - pdfSource.peek() != (int)'/' && - (pdfSource.peek() < (int)'0' || - pdfSource.peek() > (int)'9' ) ) - { - buffer.append( (char)pdfSource.read() ); - } - return buffer.toString(); - } -} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/pdfparser/PDFXref.java b/src/main/java/org/pdfbox/pdfparser/PDFXref.java deleted file mode 100644 index abe0f35..0000000 --- a/src/main/java/org/pdfbox/pdfparser/PDFXref.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (c) 2003, www.pdfbox.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. Neither the name of pdfbox; nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
/**
 * This class represents a PDF xref as an immutable pair of values: a start
 * attribute and a count attribute.
 * <p>
 * Both values are supplied to the constructor and can never change
 * afterwards, so an instance can be shared freely once it has been created.
 * No validation is applied; any {@code long} is accepted for either value.
 * <p>
 * NOTE(review): presumably start is the first object number and count the
 * number of entries of a cross-reference subsection -- confirm against the
 * parser code that constructs these objects.
 *
 * @author Ben Litchfield (ben@csh.rit.edu)
 * @version $Revision: 1.3 $
 */
public class PDFXref
{

    // Final: the values are assigned exactly once, in the constructor.
    private final long count;
    private final long start;

    /**
     * Constructor. Stores both attributes as given.
     *
     * @param startValue The start attribute.
     * @param countValue The count attribute.
     */
    public PDFXref( long startValue, long countValue )
    {
        start = startValue;
        count = countValue;
    }

    /**
     * This will get the count attribute.
     *
     * @return The count that was passed to the constructor.
     */
    public long getCount()
    {
        return count;
    }

    /**
     * This will get the start attribute.
     *
     * @return The start that was passed to the constructor.
     */
    public long getStart()
    {
        return start;
    }
}
- - -- cgit v1.2.3