/** * Copyright (c) 2003-2005, www.pdfbox.org * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of pdfbox; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://www.pdfbox.org * */ package org.pdfbox.pdfparser; import java.io.File; import java.io.RandomAccessFile; import java.io.InputStream; import java.io.IOException; import java.util.Iterator; import org.pdfbox.cos.COSBase; import org.pdfbox.cos.COSDictionary; import org.pdfbox.cos.COSDocument; import org.pdfbox.cos.COSObject; import org.pdfbox.cos.COSStream; import org.pdfbox.exceptions.WrappedIOException; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.fdf.FDFDocument; import org.pdfbox.persistence.util.COSObjectKey; import org.apache.log4j.Logger; /** * This class will handle the parsing of the PDF document. * * @author Ben Litchfield (ben@benlitchfield.com) * @version $Revision: 1.47 $ */ public class PDFParser extends BaseParser { private static Logger log = Logger.getLogger( PDFParser.class ); private static final int SPACE_BYTE = 32; private static final String PDF_HEADER = "%PDF-"; private COSDocument document; /** * Temp file directory. */ private File tempDirectory = new File( System.getProperty( "java.io.tmpdir" ) ); private RandomAccessFile raf = null; /** * Constructor. * * @param input The input stream that contains the PDF document. * * @throws IOException If there is an error initializing the stream. */ public PDFParser( InputStream input ) throws IOException { this(input, null); } /** * Constructor to allow control over RandomAccessFile. * @param input The input stream that contains the PDF document. * @param rafi The RandomAccessFile to be used in internal COSDocument * * @throws IOException If there is an error initializing the stream. */ public PDFParser(InputStream input, RandomAccessFile rafi) throws IOException { super(input); this.raf = rafi; } /** * This is the directory where pdfbox will create a temporary file * for storing pdf document stream in. By default this directory will * be the value of the system property java.io.tmpdir. * * @param tmpDir The directory to create scratch files needed to store * pdf document streams. */ public void setTempDirectory( File tmpDir ) { tempDirectory = tmpDir; } /** * This will prase the stream and create the PDF document. This will close * the stream when it is done parsing. * * @throws IOException If there is an error reading from the stream. */ public void parse() throws IOException { try { if ( raf == null ) { document = new COSDocument( tempDirectory ); } else { document = new COSDocument( raf ); } setDocument( document ); String header = readLine(); if( log.isDebugEnabled() ) { log.debug( "Header=" + header ); } document.setHeaderString( header ); if( header.length() < PDF_HEADER.length()+1 ) { throw new IOException( "Error: Header is corrupt '" + header + "'" ); } //sometimes there are some garbage bytes in the header before the header //actually starts, so lets try to find the header first. int headerStart = header.indexOf( PDF_HEADER ); //greater than zero because if it is zero then //there is no point of trimming if( headerStart > 0 ) { //trim off any leading characters header = header.substring( headerStart, header.length() ); } try { float pdfVersion = Float.parseFloat( header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) ); document.setVersion( pdfVersion ); } catch( NumberFormatException e ) { throw new IOException( "Error getting pdf version:" + e ); } skipHeaderFillBytes(); Object nextObject; boolean wasLastParsedObjectAnXref = false; try { while( (nextObject = parseObject()) != null ) { if( nextObject instanceof PDFXref ) { PDFXref xref = (PDFXref)nextObject; addXref(xref); wasLastParsedObjectAnXref = true; } else { wasLastParsedObjectAnXref = false; } skipSpaces(); } if( document.getTrailer() == null ) { COSDictionary trailer = new COSDictionary(); Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator(); while( xrefIter.hasNext() ) { COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject(); trailer.addAll( next ); } document.setTrailer( trailer ); } if( !document.isEncrypted() ) { document.dereferenceObjectStreams(); } } catch( IOException e ) { if( wasLastParsedObjectAnXref ) { log.debug( "Skipping some garbage", e ); //Then we assume that there is just random garbage after //the xref, not sure why the PDF spec allows this but it does. } else { //some other error so just pass it along throw e; } } } catch( Throwable t ) { //so if the PDF is corrupt then close the document and clear //all resources to it if( document != null ) { document.close(); } if( t instanceof IOException ) { throw (IOException)t; } else { throw new WrappedIOException( t ); } } finally { pdfSource.close(); } } /** * This will skip a header's binary fill bytes. This is in accordance to * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header" * * @throws IOException If there is an error reading from the stream. */ protected void skipHeaderFillBytes() throws IOException { skipSpaces(); int c = pdfSource.peek(); if( !Character.isDigit( (char)c ) ) { // Fill bytes conform with PDF reference (but without comment sign) // => skip until EOL readLine(); } // else: no fill bytes } /** * This will get the document that was parsed. parse() must be called before this is called. * When you are done with this document you must call close() on it to release * resources. * * @return The document that was parsed. * * @throws IOException If there is an error getting the document. */ public COSDocument getDocument() throws IOException { if( document == null ) { throw new IOException( "You must call parse() before calling getDocument()" ); } return document; } /** * This will get the PD document that was parsed. When you are done with * this document you must call close() on it to release resources. * * @return The document at the PD layer. * * @throws IOException If there is an error getting the document. */ public PDDocument getPDDocument() throws IOException { return new PDDocument( getDocument() ); } /** * This will get the FDF document that was parsed. When you are done with * this document you must call close() on it to release resources. * * @return The document at the PD layer. * * @throws IOException If there is an error getting the document. */ public FDFDocument getFDFDocument() throws IOException { return new FDFDocument( getDocument() ); } /** * This will parse a document object from the stream. * * @return The parsed object. * * @throws IOException If an IO error occurs. */ private Object parseObject() throws IOException { Object object = null; char peekedChar = (char)pdfSource.peek(); if( log.isDebugEnabled() ) { log.debug( "PDFParser.parseObject() peek='" + peekedChar + "'" ); } if( pdfSource.isEOF() ) { if( log.isDebugEnabled() ) { log.debug( "Skipping because of EOF" ); //end of file we will return a null object and call it a day. } } else if( peekedChar == 'x' || peekedChar == 't' || peekedChar == 's') { //System.out.println( "parseObject() parsing xref" ); //FDF documents do not always have the xref if( peekedChar == 'x' || peekedChar == 't' ) { object = parseXrefSection(); } //if peeked char is xref or startxref if( peekedChar == 'x' || peekedChar == 's') { skipSpaces(); while( pdfSource.peek() == 'x' ) { parseXrefSection(); } String startxref = readString(); if( !startxref.equals( "startxref" ) ) { throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource ); } skipSpaces(); //read some integer that is in the stream but PDFBox doesn't use readInt(); } //This MUST be readLine because readString strips out comments //and it will think that %% is a comment in from of the EOF String eof = readExpectedString( "%%EOF" ); if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() ) { throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() + " next=" +readString() ); } else if( !pdfSource.isEOF() ) { //we might really be at the end of the file, there might just be some crap at the //end of the file. if( pdfSource.available() < 1000 ) { //We need to determine if we are at the end of the file. byte[] data = new byte[ 1000 ]; int amountRead = pdfSource.read( data ); if( amountRead != -1 ) { pdfSource.unread( data, 0, amountRead ); } boolean atEndOfFile = true;//we assume yes unless we find another. for( int i=0; i