aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/util/PDFStreamEngine.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/util/PDFStreamEngine.java')
-rw-r--r--src/main/java/org/pdfbox/util/PDFStreamEngine.java622
1 files changed, 622 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFStreamEngine.java b/src/main/java/org/pdfbox/util/PDFStreamEngine.java
new file mode 100644
index 0000000..1e05f8a
--- /dev/null
+++ b/src/main/java/org/pdfbox/util/PDFStreamEngine.java
@@ -0,0 +1,622 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.util;
+
+import java.io.IOException;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Stack;
+
+import org.pdfbox.cos.COSObject;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.exceptions.WrappedIOException;
+
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.pdmodel.PDResources;
+
+import org.pdfbox.pdmodel.font.PDFont;
+
+import org.pdfbox.pdmodel.graphics.PDGraphicsState;
+
+import org.pdfbox.util.operator.OperatorProcessor;
+
+import org.apache.log4j.Logger;
+
+/**
+ * This class will run through a PDF content stream and execute certain operations
+ * and provide a callback interface for clients that want to do things with the stream.
+ * See the PDFTextStripper class for an example of how to use this class.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.29 $
+ */
+public class PDFStreamEngine
+{
+ private static Logger log = Logger.getLogger(PDFStreamEngine.class);
+
+ static protected final byte[] SPACE_BYTES = { (byte)32 };
+
+ private PDGraphicsState graphicsState = null;
+
+ protected Matrix textMatrix = null;
+ protected Matrix textLineMatrix = null;
+ protected Stack graphicsStack = new Stack();
+ //private PDResources resources = null;
+
+ protected Map operators = new HashMap();
+
+ protected Map fontToAverageWidths = new HashMap();
+
+ protected Stack streamResourcesStack = new Stack();
+
+ protected PDPage page;
+
+ /**
+ * This is a simple internal class used by the Stream engine to handle the
+ * resources stack.
+ */
+ protected static class StreamResources
+ {
+ protected Map fonts;
+ protected Map colorSpaces;
+ protected Map xobjects;
+ protected Map graphicsStates;
+ protected PDResources resources;
+ }
+
+ /**
+ * Constructor.
+ */
+ public PDFStreamEngine()
+ {
+ //default constructor
+ }
+
+ /**
+ * Constructor with engine properties. The property keys are all
+ * PDF operators, the values are class names used to execute those
+ * operators.
+ *
+ * @param properties The engine properties.
+ *
+ * @throws IOException If there is an error setting the engine properties.
+ */
+ public PDFStreamEngine( Properties properties ) throws IOException
+ {
+ try
+ {
+ Iterator keys = properties.keySet().iterator();
+ while( keys.hasNext() )
+ {
+ String operator = (String)keys.next();
+ String operatorClass = properties.getProperty( operator );
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Operator Class: " + operator + "=" + operatorClass );
+ }
+ OperatorProcessor op = (OperatorProcessor)Class.forName( operatorClass ).newInstance();
+ op.setContext( this );
+ operators.put( operator, op );
+ }
+ }
+ catch( Exception e )
+ {
+ throw new WrappedIOException( e );
+ }
+ }
+
+ /**
+ * This will process the contents of the stream.
+ *
+ * @param aPage The page.
+ * @param resources The location to retrieve resources.
+ * @param cosStream the Stream to execute.
+ *
+ *
+ * @throws IOException if there is an error accessing the stream.
+ */
+ public void processStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
+ {
+ graphicsState = new PDGraphicsState();
+ textMatrix = null;
+ textLineMatrix = null;
+ graphicsStack.clear();
+ streamResourcesStack.clear();
+ fontToAverageWidths.clear();
+
+ processSubStream( aPage, resources, cosStream );
+ }
+
+ /**
+ * Process a sub stream of the current stream.
+ *
+ * @param aPage The page used for drawing.
+ * @param resources The resources used when processing the stream.
+ * @param cosStream The stream to process.
+ *
+ * @throws IOException If there is an exception while processing the stream.
+ */
+ public void processSubStream( PDPage aPage, PDResources resources, COSStream cosStream ) throws IOException
+ {
+ page = aPage;
+ if( resources != null )
+ {
+ StreamResources sr = new StreamResources();
+ sr.fonts = resources.getFonts();
+ sr.colorSpaces = resources.getColorSpaces();
+ sr.xobjects = resources.getXObjects();
+ sr.graphicsStates = resources.getGraphicsStates();
+ sr.resources = resources;
+ streamResourcesStack.push(sr);
+ }
+ try
+ {
+ List arguments = new ArrayList();
+ long startTokens = System.currentTimeMillis();
+ List tokens = cosStream.getStreamTokens();
+ long stopTokens = System.currentTimeMillis();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Getting tokens time=" + (stopTokens-startTokens) );
+ }
+ if( tokens != null )
+ {
+ Iterator iter = tokens.iterator();
+ while( iter.hasNext() )
+ {
+ Object next = iter.next();
+ if( next instanceof COSObject )
+ {
+ arguments.add( ((COSObject)next).getObject() );
+ }
+ else if( next instanceof PDFOperator )
+ {
+ processOperator( (PDFOperator)next, arguments );
+ arguments = new ArrayList();
+ }
+ else
+ {
+ arguments.add( next );
+ }
+ }
+ }
+ }
+ finally
+ {
+ if( resources != null )
+ {
+ streamResourcesStack.pop();
+ }
+ }
+
+ }
+
+ /**
+ * A method provided as an event interface to allow a subclass to perform
+ * some specific functionality when a character needs to be displayed.
+ *
+ * @param text The character to be displayed.
+ */
+ protected void showCharacter( TextPosition text )
+ {
+ //subclasses can override to provide specific functionality.
+ }
+
+ /**
+ * You should override this method if you want to perform an action when a
+ * string is being shown.
+ *
+ * @param string The string to display.
+ *
+ * @throws IOException If there is an error showing the string
+ */
+ public void showString( byte[] string ) throws IOException
+ {
+ float spaceWidth = 0;
+ float spacing = 0;
+ StringBuffer stringResult = new StringBuffer(string.length);
+
+ float characterDisplacement = 0;
+ float spaceDisplacement = 0;
+ float fontSize = graphicsState.getTextState().getFontSize();
+ float horizontalScaling = graphicsState.getTextState().getHorizontalScalingPercent()/100f;
+ float rise = graphicsState.getTextState().getRise();
+ final float wordSpacing = graphicsState.getTextState().getWordSpacing();
+ final float characterSpacing = graphicsState.getTextState().getCharacterSpacing();
+ float wordSpacingDisplacement = 0;
+
+ PDFont font = graphicsState.getTextState().getFont();
+
+ //This will typically be 1000 but in the case of a type3 font
+ //this might be a different number
+ float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 );
+ Float averageWidth = (Float)fontToAverageWidths.get( font );
+ if( averageWidth == null )
+ {
+ averageWidth = new Float( font.getAverageFontWidth() );
+ fontToAverageWidths.put( font, averageWidth );
+ }
+
+ Matrix initialMatrix = new Matrix();
+ initialMatrix.setValue(0,0,1);
+ initialMatrix.setValue(0,1,0);
+ initialMatrix.setValue(0,2,0);
+ initialMatrix.setValue(1,0,0);
+ initialMatrix.setValue(1,1,1);
+ initialMatrix.setValue(1,2,0);
+ initialMatrix.setValue(2,0,0);
+ initialMatrix.setValue(2,1,rise);
+ initialMatrix.setValue(2,2,1);
+
+
+ //this
+ int codeLength = 1;
+ Matrix ctm = graphicsState.getCurrentTransformationMatrix();
+
+ //lets see what the space displacement should be
+ spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
+ if( spaceDisplacement == 0 )
+ {
+ spaceDisplacement = (averageWidth.floatValue()/glyphSpaceToTextSpaceFactor);
+ //The average space width appears to be higher than necessary
+ //so lets make it a little bit smaller.
+ spaceDisplacement *= .80f;
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Font: Space From Average=" + spaceDisplacement );
+ }
+ }
+ int pageRotation = page.findRotation();
+ Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm );
+ float x = trm.getValue(2,0);
+ float y = trm.getValue(2,1);
+ if( pageRotation == 0 )
+ {
+ trm.setValue( 2,1, -y + page.findMediaBox().getHeight() );
+ }
+ else if( pageRotation == 90 )
+ {
+ trm.setValue( 2,0, y );
+ trm.setValue( 2,1, x );
+ }
+ else if( pageRotation == 270 )
+ {
+ trm.setValue( 2,0, -y + page.findMediaBox().getHeight() );
+ trm.setValue( 2,1, x );
+ }
+ for( int i=0; i<string.length; i+=codeLength )
+ {
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "initialMatrix=" + initialMatrix );
+ log.debug( "textMatrix=" + textMatrix );
+ log.debug( "initialMatrix.multiply( textMatrix )=" + initialMatrix.multiply( textMatrix ) );
+ log.debug( "ctm=" + ctm );
+ log.debug( "trm=" + initialMatrix.multiply( textMatrix ).multiply( ctm ) );
+ }
+ codeLength = 1;
+
+ String c = font.encode( string, i, codeLength );
+
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Character Code=" + string[i] + "='" + c + "'" );
+ }
+ if( c == null && i+1<string.length)
+ {
+ //maybe a multibyte encoding
+ codeLength++;
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Multibyte Character Code=" + string[i] + string[i+1] );
+ }
+ c = font.encode( string, i, codeLength );
+ }
+ stringResult.append( c );
+
+ //todo, handle horizontal displacement
+ characterDisplacement += (font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor);
+
+
+ // PDF Spec - 5.5.2 Word Spacing
+ //
+ // Word spacing works the same was as character spacing, but applies
+ // only to the space character, code 32.
+ //
+ // Note: Word spacing is applied to every occurrence of the single-byte
+ // character code 32 in a string. This can occur when using a simple
+ // font or a composite font that defines code 32 as a single-byte code.
+ // It does not apply to occurrences of the byte value 32 in multiple-byte
+ // codes.
+ //
+ // RDD - My interpretation of this is that only character code 32's that
+ // encode to spaces should have word spacing applied. Cases have been
+ // observed where a font has a space character with a character code
+ // other than 32, and where word spacing (Tw) was used. In these cases,
+ // applying word spacing to either the non-32 space or to the character
+ // code 32 non-space resulted in errors consistent with this interpretation.
+ //
+
+ boolean withCS = false;
+ if( (string[i] == 0x20) && c.equals( " " ) )
+ {
+ spacing += wordSpacing + characterSpacing;
+ withCS = true;
+ }
+ else
+ {
+ spacing += characterSpacing;
+ }
+
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "Checking code '" + c + "' font=" + graphicsState.getTextState().getFont() +
+ " Tc=" + characterSpacing +
+ " Tw=" + wordSpacing +
+ " fontSize=" + fontSize +
+ " horizontalScaling=" + horizontalScaling +
+ " totalDisp=" + characterDisplacement +
+ " spacing=" + spacing + "(" + withCS + ")" );
+ }
+ // We want to update the textMatrix using the width, in text space units.
+ //
+
+ }
+
+ //The adjustment will always be zero. The adjustment as shown in the
+ //TJ operator will be handled separately.
+ float adjustment=0;
+ //todo, need to compute the horizontal displacement
+ float ty = 0;
+ float tx = ((characterDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize + spacing)
+ *horizontalScaling;
+
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "disp=" + characterDisplacement + " adj=" + adjustment +
+ " fSize=" + fontSize + " tx=" + tx );
+ }
+
+ float xScale = trm.getXScale();
+ float yScale = trm.getYScale();
+ float xPos = trm.getXPosition();
+ float yPos = trm.getYPosition();
+ spaceWidth = spaceDisplacement * xScale * fontSize;
+ wordSpacingDisplacement = wordSpacing*xScale * fontSize;
+ Matrix td = new Matrix();
+ td.setValue( 2, 0, tx );
+ td.setValue( 2, 1, ty );
+
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "TRM=" + trm );
+ log.debug( "TextMatrix before " + textMatrix );
+ }
+ float xPosBefore = textMatrix.getXPosition();
+ float yPosBefore = textMatrix.getYPosition();
+ textMatrix = td.multiply( textMatrix );
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "TextMatrix after " + textMatrix );
+ }
+ float totalStringDisplacement = 0;
+ if( pageRotation == 0 )
+ {
+ totalStringDisplacement = (textMatrix.getXPosition() - xPosBefore);
+ }
+ else if( pageRotation == 90 )
+ {
+ totalStringDisplacement = (textMatrix.getYPosition() - yPosBefore);
+ }
+ else if( pageRotation == 270 )
+ {
+ totalStringDisplacement = (yPosBefore - textMatrix.getYPosition());
+ }
+ showCharacter(
+ new TextPosition(
+ xPos,
+ yPos,
+ xScale,
+ yScale,
+ totalStringDisplacement,
+ spaceWidth,
+ stringResult.toString(),
+ graphicsState.getTextState().getFont(),
+ graphicsState.getTextState().getFontSize(),
+ wordSpacingDisplacement ));
+ }
+
+ /**
+ * This is used to handle an operation.
+ *
+ * @param operation The operation to perform.
+ * @param arguments The list of arguments.
+ *
+ * @throws IOException If there is an error processing the operation.
+ */
+ public void processOperator( String operation, List arguments ) throws IOException
+ {
+ PDFOperator oper = PDFOperator.getOperator( operation );
+ processOperator( oper, arguments );
+ }
+
+ /**
+ * This is used to handle an operation.
+ *
+ * @param operator The operation to perform.
+ * @param arguments The list of arguments.
+ *
+ * @throws IOException If there is an error processing the operation.
+ */
+ protected void processOperator( PDFOperator operator, List arguments ) throws IOException
+ {
+ String operation = operator.getOperation();
+ if( log.isDebugEnabled() )
+ {
+ log.debug( "processOperator( '" + operation + "' )" );
+ }
+ OperatorProcessor processor = (OperatorProcessor)operators.get( operation );
+ if( processor != null )
+ {
+ processor.process( operator, arguments );
+ }
+ }
+
+ /**
+ * @return Returns the colorSpaces.
+ */
+ public Map getColorSpaces()
+ {
+ return ((StreamResources) streamResourcesStack.peek()).colorSpaces;
+ }
+
+ /**
+ * @return Returns the colorSpaces.
+ */
+ public Map getXObjects()
+ {
+ return ((StreamResources) streamResourcesStack.peek()).xobjects;
+ }
+
+ /**
+ * @param value The colorSpaces to set.
+ */
+ public void setColorSpaces(Map value)
+ {
+ ((StreamResources) streamResourcesStack.peek()).colorSpaces = value;
+ }
+ /**
+ * @return Returns the fonts.
+ */
+ public Map getFonts()
+ {
+ return ((StreamResources) streamResourcesStack.peek()).fonts;
+ }
+ /**
+ * @param value The fonts to set.
+ */
+ public void setFonts(Map value)
+ {
+ ((StreamResources) streamResourcesStack.peek()).fonts = value;
+ }
+ /**
+ * @return Returns the graphicsStack.
+ */
+ public Stack getGraphicsStack()
+ {
+ return graphicsStack;
+ }
+ /**
+ * @param value The graphicsStack to set.
+ */
+ public void setGraphicsStack(Stack value)
+ {
+ graphicsStack = value;
+ }
+ /**
+ * @return Returns the graphicsState.
+ */
+ public PDGraphicsState getGraphicsState()
+ {
+ return graphicsState;
+ }
+ /**
+ * @param value The graphicsState to set.
+ */
+ public void setGraphicsState(PDGraphicsState value)
+ {
+ graphicsState = value;
+ }
+ /**
+ * @return Returns the graphicsStates.
+ */
+ public Map getGraphicsStates()
+ {
+ return ((StreamResources) streamResourcesStack.peek()).graphicsStates;
+ }
+ /**
+ * @param value The graphicsStates to set.
+ */
+ public void setGraphicsStates(Map value)
+ {
+ ((StreamResources) streamResourcesStack.peek()).graphicsStates = value;
+ }
+ /**
+ * @return Returns the textLineMatrix.
+ */
+ public Matrix getTextLineMatrix()
+ {
+ return textLineMatrix;
+ }
+ /**
+ * @param value The textLineMatrix to set.
+ */
+ public void setTextLineMatrix(Matrix value)
+ {
+ textLineMatrix = value;
+ }
+ /**
+ * @return Returns the textMatrix.
+ */
+ public Matrix getTextMatrix()
+ {
+ return textMatrix;
+ }
+ /**
+ * @param value The textMatrix to set.
+ */
+ public void setTextMatrix(Matrix value)
+ {
+ textMatrix = value;
+ }
+ /**
+ * @return Returns the resources.
+ */
+ public PDResources getResources()
+ {
+ return ((StreamResources) streamResourcesStack.peek()).resources;
+ }
+
+ /**
+ * Get the current page that is being processed.
+ *
+ * @return The page being processed.
+ */
+ public PDPage getCurrentPage()
+ {
+ return page;
+ }
+} \ No newline at end of file