/**
 * Copyright (c) 2003-2005, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.util;

import java.io.IOException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;

import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;

import org.pdfbox.exceptions.WrappedIOException;

import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;

import org.pdfbox.pdmodel.font.PDFont;
import org.pdfbox.pdmodel.graphics.PDGraphicsState;

import org.pdfbox.util.operator.OperatorProcessor;

import org.apache.log4j.Logger;

/**
 * This class will run through a PDF content stream and execute certain operations
 * and provide a callback interface for clients that want to do things with the stream.
 * See the PDFTextStripper class for an example of how to use this class.
 *
 * The engine keeps per-run state (graphics state, text matrices, resource stack) in
 * instance fields that are reset by {@link #processStream(PDPage, PDResources, COSStream)}.
 *
 * @author Ben Litchfield (ben@benlitchfield.com)
 * @version $Revision: 1.29 $
 */
public class PDFStreamEngine
{
    private static final Logger log = Logger.getLogger(PDFStreamEngine.class);

    /** The single byte 32, used to ask a font for the width of its space glyph. */
    static protected final byte[] SPACE_BYTES = { (byte)32 };

    private PDGraphicsState graphicsState = null;

    protected Matrix textMatrix = null;
    protected Matrix textLineMatrix = null;
    protected Stack graphicsStack = new Stack();

    /** Maps an operator name (String) to the OperatorProcessor that handles it. */
    protected Map operators = new HashMap();

    /** Caches the average font width (Float) per PDFont for the current run. */
    protected Map fontToAverageWidths = new HashMap();

    /** Stack of StreamResources; one entry per (sub)stream that supplied resources. */
    protected Stack streamResourcesStack = new Stack();

    protected PDPage page;

    /**
     * This is a simple internal class used by the Stream engine to handle the
     * resources stack.
     */
    protected static class StreamResources
    {
        protected Map fonts;
        protected Map colorSpaces;
        protected Map xobjects;
        protected Map graphicsStates;
        protected PDResources resources;
    }

    /**
     * Constructor.  No operator processors are registered, so every operator
     * encountered while processing a stream is ignored.
     */
    public PDFStreamEngine()
    {
        //default constructor
    }

    /**
     * Constructor with engine properties.  The property keys are all
     * PDF operators, the values are class names used to execute those
     * operators.  Each class is instantiated reflectively, given this engine as
     * its context, and registered under its operator name.
     *
     * @param properties The engine properties.
     *
     * @throws IOException If there is an error setting the engine properties; any
     *         exception raised while loading or instantiating a processor is wrapped.
     */
    public PDFStreamEngine( Properties properties ) throws IOException
    {
        try
        {
            Iterator keys = properties.keySet().iterator();
            while( keys.hasNext() )
            {
                String operator = (String)keys.next();
                String operatorClass = properties.getProperty( operator );
                if( log.isDebugEnabled() )
                {
                    log.debug( "Operator Class: " + operator + "=" + operatorClass );
                }
                OperatorProcessor op = (OperatorProcessor)Class.forName( operatorClass ).newInstance();
                op.setContext( this );
                operators.put( operator, op );
            }
        }
        catch( Exception e )
        {
            throw new WrappedIOException( e );
        }
    }

    /**
     * This will process the contents of the stream.  All per-run state (graphics
     * state, text matrices, graphics stack, resources stack and the font width
     * cache) is reset before the stream is handed to
     * {@link #processSubStream(PDPage, PDResources, COSStream)}.
     *
     * @param aPage The page.
     * @param resources The location to retrieve resources.
     * @param cosStream the Stream to execute.
     *
     * @throws IOException if there is an error accessing the stream.
     */
    public void processStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
    {
        graphicsState = new PDGraphicsState();
        textMatrix = null;
        textLineMatrix = null;
        graphicsStack.clear();
        streamResourcesStack.clear();
        fontToAverageWidths.clear();

        processSubStream( aPage, resources, cosStream );
    }

    /**
     * Process a sub stream of the current stream.  If resources are supplied they
     * are pushed onto the resources stack for the duration of the call and popped
     * again afterwards, even when processing fails.
     *
     * @param aPage The page used for drawing.
     * @param resources The resources used when processing the stream, may be null.
     * @param cosStream The stream to process.
     *
     * @throws IOException If there is an exception while processing the stream.
     */
    public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
    {
        page = aPage;
        if( resources != null )
        {
            StreamResources sr = new StreamResources();
            sr.fonts = resources.getFonts();
            sr.colorSpaces = resources.getColorSpaces();
            sr.xobjects = resources.getXObjects();
            sr.graphicsStates = resources.getGraphicsStates();
            sr.resources = resources;
            streamResourcesStack.push(sr);
        }
        try
        {
            List arguments = new ArrayList();
            long startTokens = System.currentTimeMillis();
            List tokens = cosStream.getStreamTokens();
            long stopTokens = System.currentTimeMillis();
            if( log.isDebugEnabled() )
            {
                log.debug( "Getting tokens time=" + (stopTokens-startTokens) );
            }
            if( tokens != null )
            {
                Iterator iter = tokens.iterator();
                while( iter.hasNext() )
                {
                    Object next = iter.next();
                    if( next instanceof COSObject )
                    {
                        //indirect references are resolved before being passed as operands
                        arguments.add( ((COSObject)next).getObject() );
                    }
                    else if( next instanceof PDFOperator )
                    {
                        processOperator( (PDFOperator)next, arguments );
                        //each operator gets a fresh operand list
                        arguments = new ArrayList();
                    }
                    else
                    {
                        arguments.add( next );
                    }
                }
            }
        }
        finally
        {
            //only pop what this call pushed
            if( resources != null )
            {
                streamResourcesStack.pop();
            }
        }
    }

    /**
     * A method provided as an event interface to allow a subclass to perform
     * some specific functionality when a character needs to be displayed.
     *
     * @param text The character to be displayed.
     */
    protected void showCharacter( TextPosition text )
    {
        //subclasses can override to provide specific functionality.
    }

    /**
     * You should override this method if you want to perform an action when a
     * string is being shown.
     *
     * The default implementation decodes the bytes with the current font,
     * accumulates the glyph widths and the character/word spacing, advances the
     * text matrix by the resulting displacement and reports the decoded text
     * through a single {@link #showCharacter(TextPosition)} call.  Codes that the
     * font cannot decode contribute no text but their width and character
     * spacing are still accounted for.
     *
     * @param string The string to display.
     *
     * @throws IOException If there is an error showing the string
     */
    public void showString( byte[] string ) throws IOException
    {
        float spaceWidth = 0;
        float spacing = 0;
        StringBuffer stringResult = new StringBuffer(string.length);

        float characterDisplacement = 0;
        float spaceDisplacement = 0;
        float fontSize = graphicsState.getTextState().getFontSize();
        float horizontalScaling = graphicsState.getTextState().getHorizontalScalingPercent()/100f;
        float rise = graphicsState.getTextState().getRise();
        final float wordSpacing = graphicsState.getTextState().getWordSpacing();
        final float characterSpacing = graphicsState.getTextState().getCharacterSpacing();
        float wordSpacingDisplacement = 0;

        PDFont font = graphicsState.getTextState().getFont();

        //This will typically be 1000 but in the case of a type3 font
        //this might be a different number
        float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 );
        Float averageWidth = (Float)fontToAverageWidths.get( font );
        if( averageWidth == null )
        {
            averageWidth = new Float( font.getAverageFontWidth() );
            fontToAverageWidths.put( font, averageWidth );
        }

        //identity matrix with the text rise as the vertical translation
        Matrix initialMatrix = new Matrix();
        initialMatrix.setValue(0,0,1);
        initialMatrix.setValue(0,1,0);
        initialMatrix.setValue(0,2,0);
        initialMatrix.setValue(1,0,0);
        initialMatrix.setValue(1,1,1);
        initialMatrix.setValue(1,2,0);
        initialMatrix.setValue(2,0,0);
        initialMatrix.setValue(2,1,rise);
        initialMatrix.setValue(2,2,1);

        int codeLength = 1;
        Matrix ctm = graphicsState.getCurrentTransformationMatrix();

        //lets see what the space displacement should be
        spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
        if( spaceDisplacement == 0 )
        {
            spaceDisplacement = (averageWidth.floatValue()/glyphSpaceToTextSpaceFactor);
            //The average space width appears to be higher than necessary
            //so lets make it a little bit smaller.
            spaceDisplacement *= .80f;
            if( log.isDebugEnabled() )
            {
                log.debug( "Font: Space From Average=" + spaceDisplacement );
            }
        }

        int pageRotation = page.findRotation();
        Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm );
        float x = trm.getValue(2,0);
        float y = trm.getValue(2,1);
        // NOTE(review): rotations other than 0, 90 and 270 (e.g. 180) leave trm
        // untouched here and report a zero displacement below - confirm intent.
        if( pageRotation == 0 )
        {
            trm.setValue( 2,1, -y + page.findMediaBox().getHeight() );
        }
        else if( pageRotation == 90 )
        {
            trm.setValue( 2,0, y );
            trm.setValue( 2,1, x );
        }
        else if( pageRotation == 270 )
        {
            trm.setValue( 2,0, -y + page.findMediaBox().getHeight() );
            trm.setValue( 2,1, x );
        }

        for( int i=0; i<string.length; i+=codeLength )
        {
            if( log.isDebugEnabled() )
            {
                log.debug( "initialMatrix=" + initialMatrix );
                log.debug( "textMatrix=" + textMatrix );
                log.debug( "initialMatrix.multiply( textMatrix )=" + initialMatrix.multiply( textMatrix ) );
                log.debug( "ctm=" + ctm );
                log.debug( "trm=" + initialMatrix.multiply( textMatrix ).multiply( ctm ) );
            }

            codeLength = 1;

            String c = font.encode( string, i, codeLength );
            if( log.isDebugEnabled() )
            {
                log.debug( "Character Code=" + string[i] + "='" + c + "'" );
            }
            if( c == null && i+1<string.length)
            {
                //maybe a multibyte encoding
                codeLength++;
                if( log.isDebugEnabled() )
                {
                    log.debug( "Multibyte Character Code=" + string[i] + string[i+1] );
                }
                c = font.encode( string, i, codeLength );
            }

            //The font may still be unable to decode the code.  Appending a null
            //String would add the literal text "null", so skip it instead.
            if( c != null )
            {
                stringResult.append( c );
            }

            //todo, handle horizontal displacement
            characterDisplacement += (font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor);

            // PDF Spec - 5.5.2 Word Spacing
            //
            // Word spacing works the same was as character spacing, but applies
            // only to the space character, code 32.
            //
            // Note: Word spacing is applied to every occurrence of the single-byte
            // character code 32 in a string.  This can occur when using a simple
            // font or a composite font that defines code 32 as a single-byte code.
            // It does not apply to occurrences of the byte value 32 in multiple-byte
            // codes.
            //
            // RDD - My interpretation of this is that only character code 32's that
            // encode to spaces should have word spacing applied.  Cases have been
            // observed where a font has a space character with a character code
            // other than 32, and where word spacing (Tw) was used.  In these cases,
            // applying word spacing to either the non-32 space or to the character
            // code 32 non-space resulted in errors consistent with this interpretation.
            //
            boolean withCS = false;
            //literal-first equals keeps this safe when the code could not be decoded
            if( (string[i] == 0x20) && " ".equals( c ) )
            {
                spacing += wordSpacing + characterSpacing;
                withCS = true;
            }
            else
            {
                spacing += characterSpacing;
            }

            if( log.isDebugEnabled() )
            {
                log.debug( "Checking code '" + c + "' font=" + font +
                           " Tc=" + characterSpacing +
                           " Tw=" + wordSpacing +
                           " fontSize=" + fontSize +
                           " horizontalScaling=" + horizontalScaling +
                           " totalDisp=" + characterDisplacement +
                           " spacing=" + spacing + "(" + withCS + ")" );
            }
            // We want to update the textMatrix using the width, in text space units.
            //
        }

        //The adjustment will always be zero.  The adjustment as shown in the
        //TJ operator will be handled separately.
        float adjustment=0;
        //todo, need to compute the horizontal displacement
        float ty = 0;
        float tx = ((characterDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize + spacing)
                   *horizontalScaling;
        if( log.isDebugEnabled() )
        {
            log.debug( "disp=" + characterDisplacement + " adj=" + adjustment +
                       " fSize=" + fontSize + " tx=" + tx );
        }

        float xScale = trm.getXScale();
        float yScale = trm.getYScale();
        float xPos = trm.getXPosition();
        float yPos = trm.getYPosition();
        spaceWidth = spaceDisplacement * xScale * fontSize;
        wordSpacingDisplacement = wordSpacing*xScale * fontSize;

        //translation that moves the text matrix past the string just shown
        Matrix td = new Matrix();
        td.setValue( 2, 0, tx );
        td.setValue( 2, 1, ty );
        if( log.isDebugEnabled() )
        {
            log.debug( "TRM=" + trm );
            log.debug( "TextMatrix before " + textMatrix );
        }

        float xPosBefore = textMatrix.getXPosition();
        float yPosBefore = textMatrix.getYPosition();
        textMatrix = td.multiply( textMatrix );
        if( log.isDebugEnabled() )
        {
            log.debug( "TextMatrix after " + textMatrix );
        }

        float totalStringDisplacement = 0;
        if( pageRotation == 0 )
        {
            totalStringDisplacement = (textMatrix.getXPosition() - xPosBefore);
        }
        else if( pageRotation == 90 )
        {
            totalStringDisplacement = (textMatrix.getYPosition() - yPosBefore);
        }
        else if( pageRotation == 270 )
        {
            totalStringDisplacement = (yPosBefore - textMatrix.getYPosition());
        }

        showCharacter(
            new TextPosition(
                xPos,
                yPos,
                xScale,
                yScale,
                totalStringDisplacement,
                spaceWidth,
                stringResult.toString(),
                font,
                fontSize,
                wordSpacingDisplacement ));
    }

    /**
     * This is used to handle an operation.  The operator name is resolved to a
     * PDFOperator and then dispatched like any operator read from a stream.
     *
     * @param operation The operation to perform.
     * @param arguments The list of arguments.
     *
     * @throws IOException If there is an error processing the operation.
     */
    public void processOperator( String operation, List arguments ) throws IOException
    {
        PDFOperator oper = PDFOperator.getOperator( operation );
        processOperator( oper, arguments );
    }

    /**
     * This is used to handle an operation.  Operators without a registered
     * processor are silently ignored.
     *
     * @param operator The operation to perform.
     * @param arguments The list of arguments.
     *
     * @throws IOException If there is an error processing the operation.
     */
    protected void processOperator( PDFOperator operator, List arguments ) throws IOException
    {
        String operation = operator.getOperation();
        if( log.isDebugEnabled() )
        {
            log.debug( "processOperator( '" + operation + "' )" );
        }
        OperatorProcessor processor = (OperatorProcessor)operators.get( operation );
        if( processor != null )
        {
            processor.process( operator, arguments );
        }
    }

    /**
     * Resource accessors below read the top of the resources stack and therefore
     * throw java.util.EmptyStackException when no resources have been pushed.
     *
     * @return Returns the colorSpaces.
     */
    public Map getColorSpaces()
    {
        return ((StreamResources) streamResourcesStack.peek()).colorSpaces;
    }

    /**
     * @return Returns the xobjects of the resources currently on top of the stack.
     */
    public Map getXObjects()
    {
        return ((StreamResources) streamResourcesStack.peek()).xobjects;
    }

    /**
     * @param value The colorSpaces to set.
     */
    public void setColorSpaces(Map value)
    {
        ((StreamResources) streamResourcesStack.peek()).colorSpaces = value;
    }

    /**
     * @return Returns the fonts.
     */
    public Map getFonts()
    {
        return ((StreamResources) streamResourcesStack.peek()).fonts;
    }

    /**
     * @param value The fonts to set.
     */
    public void setFonts(Map value)
    {
        ((StreamResources) streamResourcesStack.peek()).fonts = value;
    }

    /**
     * @return Returns the graphicsStack.
     */
    public Stack getGraphicsStack()
    {
        return graphicsStack;
    }

    /**
     * @param value The graphicsStack to set.
     */
    public void setGraphicsStack(Stack value)
    {
        graphicsStack = value;
    }

    /**
     * @return Returns the graphicsState.
     */
    public PDGraphicsState getGraphicsState()
    {
        return graphicsState;
    }

    /**
     * @param value The graphicsState to set.
     */
    public void setGraphicsState(PDGraphicsState value)
    {
        graphicsState = value;
    }

    /**
     * @return Returns the graphicsStates.
     */
    public Map getGraphicsStates()
    {
        return ((StreamResources) streamResourcesStack.peek()).graphicsStates;
    }

    /**
     * @param value The graphicsStates to set.
     */
    public void setGraphicsStates(Map value)
    {
        ((StreamResources) streamResourcesStack.peek()).graphicsStates = value;
    }

    /**
     * @return Returns the textLineMatrix.
     */
    public Matrix getTextLineMatrix()
    {
        return textLineMatrix;
    }

    /**
     * @param value The textLineMatrix to set.
     */
    public void setTextLineMatrix(Matrix value)
    {
        textLineMatrix = value;
    }

    /**
     * @return Returns the textMatrix.
     */
    public Matrix getTextMatrix()
    {
        return textMatrix;
    }

    /**
     * @param value The textMatrix to set.
     */
    public void setTextMatrix(Matrix value)
    {
        textMatrix = value;
    }

    /**
     * @return Returns the resources.
     */
    public PDResources getResources()
    {
        return ((StreamResources) streamResourcesStack.peek()).resources;
    }

    /**
     * Get the current page that is being processed.
     *
     * @return The page being processed.
     */
    public PDPage getCurrentPage()
    {
        return page;
    }
}