/** * Copyright (c) 2003-2005, www.pdfbox.org * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of pdfbox; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.util;

import java.io.IOException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;

import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;

import org.pdfbox.exceptions.WrappedIOException;

import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;

import org.pdfbox.pdmodel.font.PDFont;
import org.pdfbox.pdmodel.graphics.PDGraphicsState;

import org.pdfbox.util.operator.OperatorProcessor;

import org.apache.log4j.Logger;

/**
 * This class will run through a PDF content stream and execute certain operations
 * and provide a callback interface for clients that want to do things with the stream.
 * Operators found in the stream are dispatched to the OperatorProcessor registered
 * for them; subclasses receive callbacks such as showCharacter.
 * See the PDFTextStripper class for an example of how to use this class.
 *
 * @author Ben Litchfield (ben@benlitchfield.com)
 * @version $Revision: 1.29 $
 */
public class PDFStreamEngine
{
    // Log4j logger for this class.
    private static Logger log = Logger.getLogger(PDFStreamEngine.class);

    /**
     * The single byte 32 (an ASCII space).  showString passes it to the
     * current font to measure the width of a space.
     */
    static protected final byte[] SPACE_BYTES = { (byte)32 };

    // Current graphics state; replaced with a fresh instance at the start of processStream.
    private PDGraphicsState graphicsState = null;
    // Text matrix; reset to null by processStream and multiplied into the
    // rendering matrix in showString.
    protected Matrix textMatrix = null;
    // Text line matrix; reset to null by processStream.
    // NOTE(review): presumably maintained by the text operators -- confirm.
    protected Matrix textLineMatrix = null;
    // Cleared by processStream.
    // NOTE(review): presumably holds saved graphics states -- confirm against the operator classes.
    protected Stack graphicsStack = new Stack();
    //private PDResources resources = null;

    // Maps a PDF operator (String) to the OperatorProcessor that handles it;
    // populated by the Properties constructor.
    protected Map operators = new HashMap();
    // Cache of PDFont -> Float average font width; filled lazily by showString
    // and cleared by processStream.
    protected Map fontToAverageWidths = new HashMap();

    // Stack of StreamResources; processSubStream pushes an entry when it is given
    // non-null resources and pops it again when it finishes.
    protected Stack streamResourcesStack = new Stack();

    // The page passed to the most recent processSubStream call.
    protected PDPage page;

    /**
     * This is a simple internal class used by the Stream engine to handle the
     * resources stack.  processSubStream fills one instance from a PDResources
     * object and pushes it onto the resources stack.
     */
    protected static class StreamResources
    {
        // Result of PDResources.getFonts().
        protected Map fonts;
        // Result of PDResources.getColorSpaces().
        protected Map colorSpaces;
        // Result of PDResources.getXObjects().
        protected Map xobjects;
        // Result of PDResources.getGraphicsStates().
        protected Map graphicsStates;
        // The PDResources object the maps above were read from.
        protected PDResources resources;
    }

    /**
     * Constructor.  Creates an engine without registering any operator
     * processors.
     */
    public PDFStreamEngine()
    {
        //default constructor
    }

    /**
     * Constructor with engine properties.  The property keys are all
     * PDF operators, the values are class names used to execute those
     * operators.
* * @param properties The engine properties. * * @throws IOException If there is an error setting the engine properties. */ public PDFStreamEngine( Properties properties ) throws IOException { try { Iterator keys = properties.keySet().iterator(); while( keys.hasNext() ) { String operator = (String)keys.next(); String operatorClass = properties.getProperty( operator ); if( log.isDebugEnabled() ) { log.debug( "Operator Class: " + operator + "=" + operatorClass ); } OperatorProcessor op = (OperatorProcessor)Class.forName( operatorClass ).newInstance(); op.setContext( this ); operators.put( operator, op ); } } catch( Exception e ) { throw new WrappedIOException( e ); } } /** * This will process the contents of the stream. * * @param aPage The page. * @param resources The location to retrieve resources. * @param cosStream the Stream to execute. * * * @throws IOException if there is an error accessing the stream. */ public void processStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException { graphicsState = new PDGraphicsState(); textMatrix = null; textLineMatrix = null; graphicsStack.clear(); streamResourcesStack.clear(); fontToAverageWidths.clear(); processSubStream( aPage, resources, cosStream ); } /** * Process a sub stream of the current stream. * * @param aPage The page used for drawing. * @param resources The resources used when processing the stream. * @param cosStream The stream to process. * * @throws IOException If there is an exception while processing the stream. 
*/ public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException { page = aPage; if( resources != null ) { StreamResources sr = new StreamResources(); sr.fonts = resources.getFonts(); sr.colorSpaces = resources.getColorSpaces(); sr.xobjects = resources.getXObjects(); sr.graphicsStates = resources.getGraphicsStates(); sr.resources = resources; streamResourcesStack.push(sr); } try { List arguments = new ArrayList(); long startTokens = System.currentTimeMillis(); List tokens = cosStream.getStreamTokens(); long stopTokens = System.currentTimeMillis(); if( log.isDebugEnabled() ) { log.debug( "Getting tokens time=" + (stopTokens-startTokens) ); } if( tokens != null ) { Iterator iter = tokens.iterator(); while( iter.hasNext() ) { Object next = iter.next(); if( next instanceof COSObject ) { arguments.add( ((COSObject)next).getObject() ); } else if( next instanceof PDFOperator ) { processOperator( (PDFOperator)next, arguments ); arguments = new ArrayList(); } else { arguments.add( next ); } } } } finally { if( resources != null ) { streamResourcesStack.pop(); } } } /** * A method provided as an event interface to allow a subclass to perform * some specific functionality when a character needs to be displayed. * * @param text The character to be displayed. */ protected void showCharacter( TextPosition text ) { //subclasses can override to provide specific functionality. } /** * You should override this method if you want to perform an action when a * string is being shown. * * @param string The string to display. 
* * @throws IOException If there is an error showing the string */ public void showString( byte[] string ) throws IOException { float spaceWidth = 0; float spacing = 0; StringBuffer stringResult = new StringBuffer(string.length); float characterDisplacement = 0; float spaceDisplacement = 0; float fontSize = graphicsState.getTextState().getFontSize(); float horizontalScaling = graphicsState.getTextState().getHorizontalScalingPercent()/100f; float rise = graphicsState.getTextState().getRise(); final float wordSpacing = graphicsState.getTextState().getWordSpacing(); final float characterSpacing = graphicsState.getTextState().getCharacterSpacing(); float wordSpacingDisplacement = 0; PDFont font = graphicsState.getTextState().getFont(); //This will typically be 1000 but in the case of a type3 font //this might be a different number float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 ); Float averageWidth = (Float)fontToAverageWidths.get( font ); if( averageWidth == null ) { averageWidth = new Float( font.getAverageFontWidth() ); fontToAverageWidths.put( font, averageWidth ); } Matrix initialMatrix = new Matrix(); initialMatrix.setValue(0,0,1); initialMatrix.setValue(0,1,0); initialMatrix.setValue(0,2,0); initialMatrix.setValue(1,0,0); initialMatrix.setValue(1,1,1); initialMatrix.setValue(1,2,0); initialMatrix.setValue(2,0,0); initialMatrix.setValue(2,1,rise); initialMatrix.setValue(2,2,1); //this int codeLength = 1; Matrix ctm = graphicsState.getCurrentTransformationMatrix(); //lets see what the space displacement should be spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor); if( spaceDisplacement == 0 ) { spaceDisplacement = (averageWidth.floatValue()/glyphSpaceToTextSpaceFactor); //The average space width appears to be higher than necessary //so lets make it a little bit smaller. 
spaceDisplacement *= .80f; if( log.isDebugEnabled() ) { log.debug( "Font: Space From Average=" + spaceDisplacement ); } } int pageRotation = page.findRotation(); Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm ); float x = trm.getValue(2,0); float y = trm.getValue(2,1); if( pageRotation == 0 ) { trm.setValue( 2,1, -y + page.findMediaBox().getHeight() ); } else if( pageRotation == 90 ) { trm.setValue( 2,0, y ); trm.setValue( 2,1, x ); } else if( pageRotation == 270 ) { trm.setValue( 2,0, -y + page.findMediaBox().getHeight() ); trm.setValue( 2,1, x ); } for( int i=0; i