aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java539
1 files changed, 539 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java
new file mode 100644
index 0000000..bed1b65
--- /dev/null
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java
@@ -0,0 +1,539 @@
+/**
+ * <copyright> Copyright (c) 2006 by Know-Center, Graz, Austria </copyright>
+ *
+ * This software is the confidential and proprietary information of Know-Center,
+ * Graz, Austria. You shall not disclose such Confidential Information and shall
+ * use it only in accordance with the terms of the license agreement you entered
+ * into with Know-Center.
+ *
+ * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
+ * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
+ * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
+ * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
+ * DERIVATIVES.
+ *
+ * $Id: PDFPage.java,v 1.5 2006/10/31 08:09:33 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.pdfbox.cos.COSName;
+import org.pdfbox.cos.COSStream;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.pdmodel.PDResources;
+import org.pdfbox.pdmodel.common.PDStream;
+import org.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
+import org.pdfbox.util.Matrix;
+import org.pdfbox.util.PDFOperator;
+import org.pdfbox.util.PDFTextStripper;
+import org.pdfbox.util.TextPosition;
+import org.pdfbox.util.operator.OperatorProcessor;
+
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+
+/**
+ * PDFPage is a helper class that is used to calculate the page length of a PDF
+ * document page. It extends the PDFTextStripper class and overrides one
+ * method of interest: {@link PDFPage#showCharacter(TextPosition)}<br>
+ * This method is called while the content stream is processed. By calling the method
+ * {@link org.pdfbox.util.PDFStreamEngine#processStream(org.pdfbox.pdmodel.PDPage, org.pdfbox.pdmodel.PDResources, org.pdfbox.cos.COSStream)}
+ * the overridden method showCharacter is invoked for the text that is shown.
+ *
+ * @author wlackner
+ * @see PDFTextStripper
+ */
+public class PDFPage extends PDFTextStripper
+{
+ /**
+ * The log4j logger used for the debug output of this class.
+ */
+ private static final Logger logger_ = ConfigLogger.getLogger(PDFPage.class);
+
+ /**
+ * The maximum (lowest) y position of a character. Starts at negative
+ * infinity and is raised by showCharacter for every non-blank character
+ * that has a larger y value.
+ */
+ protected float max_character_ypos = Float.NEGATIVE_INFINITY;
+
+ /**
+ * The maximum (lowest) y position of an image. Starts at negative infinity
+ * and is raised by the "Do" operator processor for every image XObject that
+ * reaches further.
+ */
+ protected float max_image_ypos = Float.NEGATIVE_INFINITY;
+
+ /**
+ * Creates the stripper and registers a {@link MyInvoke} instance, bound to
+ * this object, as the processor for the PDF "Do" operator.
+ *
+ * @throws IOException
+ *           forwarded from the initialisation performed here (presumably
+ *           raised by the PDFTextStripper constructor).
+ */
+ public PDFPage() throws IOException
+ {
+   super();
+
+   // The processor needs this engine as its context before it is registered.
+   final OperatorProcessor xobjectInvoker = new MyInvoke();
+   xobjectInvoker.setContext(this);
+   operators.put("Do", xobjectInvoker);
+ }
+
+ // /**
+ // * You should override this method if you want to perform an action when a
+ // * string is being shown.
+ // *
+ // * @param string The string to display.
+ // *
+ // * @throws IOException If there is an error showing the string
+ // */
+ // public void showString( byte[] string ) throws IOException
+ // {
+ // float spaceWidth = 0;
+ // float spacing = 0;
+ // StringBuffer stringResult = new StringBuffer(string.length);
+ //
+ // float characterDisplacement = 0;
+ // float spaceDisplacement = 0;
+ //
+ // PDGraphicsState graphicsState = getGraphicsState();
+ // float fontSize = graphicsState.getTextState().getFontSize();
+ // float horizontalScaling =
+ // graphicsState.getTextState().getHorizontalScalingPercent()/100f;
+ // float rise = graphicsState.getTextState().getRise();
+ // final float wordSpacing = graphicsState.getTextState().getWordSpacing();
+ // final float characterSpacing =
+ // graphicsState.getTextState().getCharacterSpacing();
+ // float wordSpacingDisplacement = 0;
+ //
+ // PDFont font = graphicsState.getTextState().getFont();
+ //
+ // //This will typically be 1000 but in the case of a type3 font
+ // //this might be a different number
+ // float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0
+ // );
+ // Float averageWidth = (Float)fontToAverageWidths.get( font );
+ // if( averageWidth == null )
+ // {
+ // averageWidth = new Float( font.getAverageFontWidth() );
+ // fontToAverageWidths.put( font, averageWidth );
+ // }
+ //
+ // Matrix initialMatrix = new Matrix();
+ // initialMatrix.setValue(0,0,1);
+ // initialMatrix.setValue(0,1,0);
+ // initialMatrix.setValue(0,2,0);
+ // initialMatrix.setValue(1,0,0);
+ // initialMatrix.setValue(1,1,1);
+ // initialMatrix.setValue(1,2,0);
+ // initialMatrix.setValue(2,0,0);
+ // initialMatrix.setValue(2,1,rise);
+ // initialMatrix.setValue(2,2,1);
+ //
+ //
+ // //this
+ // int codeLength = 1;
+ // Matrix ctm = graphicsState.getCurrentTransformationMatrix();
+ //
+ // //lets see what the space displacement should be
+ // spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1
+ // )/glyphSpaceToTextSpaceFactor);
+ // if( spaceDisplacement == 0 )
+ // {
+ // spaceDisplacement =
+ // (averageWidth.floatValue()/glyphSpaceToTextSpaceFactor);
+ // //The average space width appears to be higher than necessary
+ // //so lets make it a little bit smaller.
+ // spaceDisplacement *= .80f;
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "Font: Space From Average=" + spaceDisplacement );
+ // }
+ // }
+ // int pageRotation = page.findRotation();
+ //
+ // // very strange.... the ctms are multiplied by right, but suddenly the
+ // textM is multiplied from the left.
+ // // but: PDF matrices are multiplied from left ==> ctm is wrong
+ // Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm );
+ // float x = trm.getValue(2,0);
+ // float y = trm.getValue(2,1);
+ // float flipped_y = -y + page.findMediaBox().getHeight();
+ // if( pageRotation == 0 )
+ // {
+ // trm.setValue( 2,1, flipped_y );
+ // }
+ // else if( pageRotation == 90 )
+ // {
+ // trm.setValue( 2,0, y );
+ // trm.setValue( 2,1, x );
+ // }
+ // else if( pageRotation == 270 )
+ // {
+ // trm.setValue( 2,0, flipped_y );
+ // trm.setValue( 2,1, x );
+ // }
+ // for( int i=0; i<string.length; i+=codeLength )
+ // {
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "initialMatrix=" + initialMatrix );
+ // log.debug( "textMatrix=" + textMatrix );
+ // log.debug( "initialMatrix.multiply( textMatrix )=" +
+ // initialMatrix.multiply( textMatrix ) );
+ // log.debug( "ctm=" + ctm );
+ // log.debug( "trm=" + initialMatrix.multiply( textMatrix ).multiply( ctm ) );
+ // }
+ // codeLength = 1;
+ //
+ // String c = font.encode( string, i, codeLength );
+ //
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "Character Code=" + string[i] + "='" + c + "'" );
+ // }
+ // if( c == null && i+1<string.length)
+ // {
+ // //maybe a multibyte encoding
+ // codeLength++;
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "Multibyte Character Code=" + string[i] + string[i+1] );
+ // }
+ // c = font.encode( string, i, codeLength );
+ // }
+ // stringResult.append( c );
+ //
+ // //todo, handle horizontal displacement
+ // characterDisplacement += (font.getFontWidth( string, i, codeLength
+ // )/glyphSpaceToTextSpaceFactor);
+ //
+ //
+ // // PDF Spec - 5.5.2 Word Spacing
+ // //
+ // // Word spacing works the same was as character spacing, but applies
+ // // only to the space character, code 32.
+ // //
+ // // Note: Word spacing is applied to every occurrence of the single-byte
+ // // character code 32 in a string. This can occur when using a simple
+ // // font or a composite font that defines code 32 as a single-byte code.
+ // // It does not apply to occurrences of the byte value 32 in multiple-byte
+ // // codes.
+ // //
+ // // RDD - My interpretation of this is that only character code 32's that
+ // // encode to spaces should have word spacing applied. Cases have been
+ // // observed where a font has a space character with a character code
+ // // other than 32, and where word spacing (Tw) was used. In these cases,
+ // // applying word spacing to either the non-32 space or to the character
+ // // code 32 non-space resulted in errors consistent with this
+ // interpretation.
+ // //
+ //
+ // boolean withCS = false;
+ // if( (string[i] == 0x20) && c.equals( " " ) )
+ // {
+ // spacing += wordSpacing + characterSpacing;
+ // withCS = true;
+ // }
+ // else
+ // {
+ // spacing += characterSpacing;
+ // }
+ //
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "Checking code '" + c + "' font=" +
+ // graphicsState.getTextState().getFont() +
+ // " Tc=" + characterSpacing +
+ // " Tw=" + wordSpacing +
+ // " fontSize=" + fontSize +
+ // " horizontalScaling=" + horizontalScaling +
+ // " totalDisp=" + characterDisplacement +
+ // " spacing=" + spacing + "(" + withCS + ")" );
+ // }
+ // // We want to update the textMatrix using the width, in text space units.
+ // //
+ //
+ // }
+ //
+ // //The adjustment will always be zero. The adjustment as shown in the
+ // //TJ operator will be handled separately.
+ // float adjustment=0;
+ // //todo, need to compute the horizontal displacement
+ // float ty = 0;
+ // float tx =
+ // ((characterDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize +
+ // spacing)
+ // *horizontalScaling;
+ //
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "disp=" + characterDisplacement + " adj=" + adjustment +
+ // " fSize=" + fontSize + " tx=" + tx );
+ // }
+ //
+ // float xScale = trm.getXScale();
+ // float yScale = trm.getYScale();
+ // float xPos = trm.getXPosition();
+ // float yPos = trm.getYPosition();
+ // spaceWidth = spaceDisplacement * xScale * fontSize;
+ // wordSpacingDisplacement = wordSpacing*xScale * fontSize;
+ // Matrix td = new Matrix();
+ // td.setValue( 2, 0, tx );
+ // td.setValue( 2, 1, ty );
+ //
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "TRM=" + trm );
+ // log.debug( "TextMatrix before " + textMatrix );
+ // }
+ // float xPosBefore = textMatrix.getXPosition();
+ // float yPosBefore = textMatrix.getYPosition();
+ // textMatrix = td.multiply( textMatrix );
+ // if( log.isDebugEnabled() )
+ // {
+ // log.debug( "TextMatrix after " + textMatrix );
+ // }
+ // float totalStringDisplacement = 0;
+ // if( pageRotation == 0 )
+ // {
+ // totalStringDisplacement = (textMatrix.getXPosition() - xPosBefore);
+ // }
+ // else if( pageRotation == 90 )
+ // {
+ // totalStringDisplacement = (textMatrix.getYPosition() - yPosBefore);
+ // }
+ // else if( pageRotation == 270 )
+ // {
+ // totalStringDisplacement = (yPosBefore - textMatrix.getYPosition());
+ // }
+ // showCharacter(
+ // new TextPosition(
+ // xPos,
+ // yPos,
+ // xScale,
+ // yScale,
+ // totalStringDisplacement,
+ // spaceWidth,
+ // stringResult.toString(),
+ // graphicsState.getTextState().getFont(),
+ // graphicsState.getTextState().getFontSize(),
+ // wordSpacingDisplacement ));
+ // }
+ //
+
+ /**
+ * Logs the operator at debug level and then hands it to the superclass for
+ * the actual processing.
+ *
+ * @param operator
+ *          the operator that is about to be processed.
+ * @param arguments
+ *          the arguments belonging to the operator.
+ * @throws IOException
+ *           forwarded from the superclass implementation.
+ */
+ protected void processOperator(PDFOperator operator, List arguments) throws IOException
+ {
+   // This method runs once per content stream operator, so the message is
+   // only assembled when debug output is really enabled.
+   if (logger_.isDebugEnabled())
+   {
+     logger_.debug("operator = " + operator);
+   }
+
+   super.processOperator(operator, arguments);
+ }
+
+ /**
+ * Event hook that PDFBox offers to subclasses for the moment a piece of text
+ * is about to be displayed. Here it is (mis)used to keep track of the lowest
+ * text position on the page, because PDFBox provides no other way to get at
+ * this information.
+ *
+ * @param text
+ *          the text to be displayed; its y position is compared with the
+ *          largest y position recorded so far.
+ */
+ protected void showCharacter(TextPosition text)
+ {
+   final float yPosition = text.getY();
+   final String glyph = text.getCharacter();
+
+   // A single blank is not taken into account for the page length.
+   if (glyph.equals(" "))
+   {
+     return;
+   }
+
+   if (yPosition > this.max_character_ypos)
+   {
+     this.max_character_ypos = yPosition;
+   }
+ }
+
+ // use this function to get an unsorted text output
+ // public void showString(byte[] string) {
+ // logger_.debug(new String(string));
+ // }
+
+ /**
+ * Returns the calculated page length, i.e. the larger one of the recorded
+ * character position and the recorded image position.
+ *
+ * @return the character y position if it is strictly greater than the image
+ *         y position, otherwise the image y position.
+ */
+ public float getMaxPageLength()
+ {
+   final float textBottom = this.max_character_ypos;
+   final float imageBottom = this.max_image_ypos;
+
+   return (textBottom > imageBottom) ? textBottom : imageBottom;
+ }
+
+ /**
+ * Processor for the PDF "Do" operator (invoke a named XObject). Image
+ * XObjects are not rendered: the four corners of the unit square are
+ * transformed with the current transformation matrix and the resulting
+ * lowest point is recorded in {@link PDFPage#max_image_ypos}. Form XObjects
+ * are processed as sub streams.
+ */
+ public class MyInvoke extends OperatorProcessor
+ {
+
+   /**
+    * Handles one "Do" operator.
+    *
+    * @param operator
+    *          the operator being processed (not evaluated here).
+    * @param arguments
+    *          the operator arguments; the first entry has to be the
+    *          {@link COSName} of the XObject to invoke.
+    * @throws IOException
+    *           forwarded from the PDFBox calls made while evaluating the
+    *           XObject.
+    */
+   public void process(PDFOperator operator, List arguments) throws IOException
+   {
+     COSName name = (COSName) arguments.get(0);
+     logger_.debug("<Do name=\"" + name.getName() + "\">");
+
+     Map xobjects = context.getXObjects();
+     PDXObject xobject = (PDXObject) xobjects.get(name.getName());
+     if (xobject == null)
+     {
+       // A content stream may name an XObject that the resources do not
+       // define. There is nothing to measure then, so skip it instead of
+       // failing with a NullPointerException.
+       logger_.warn("XObject \"" + name.getName() + "\" not found in the resources - ignored.");
+       return;
+     }
+
+     PDStream stream = xobject.getPDStream();
+     COSStream cos_stream = stream.getStream();
+
+     // The constant is used as receiver so that a missing subtype entry
+     // (null) simply does not match.
+     COSName subtype = (COSName) cos_stream.getDictionaryObject(COSName.SUBTYPE);
+     if (COSName.IMAGE.equals(subtype))
+     {
+       logger_.debug("XObject Image");
+
+       Matrix ctm = context.getGraphicsState().getCurrentTransformationMatrix();
+       logger_.debug("ctm = " + ctm);
+
+       // Corners of the unit square in homogeneous coordinates.
+       Pos [] coordinates = new Pos [] {
+           new Pos(0, 0, 1),
+           new Pos(1, 0, 1),
+           new Pos(0, 1, 1),
+           new Pos(1, 1, 1) };
+
+       Pos [] transformed_coordinates = transtormCoordinates(coordinates, ctm);
+
+       // Stays NaN for any other rotation value; the comparison further down
+       // is false for NaN, so such an image is not recorded.
+       float actual_lowest_point = Float.NaN;
+
+       int pageRotation = page.findRotation();
+       logger_.debug("PageRotation = " + pageRotation);
+       if (pageRotation == 0)
+       {
+         float min_y = findMinY(transformed_coordinates);
+         logger_.debug("min_y = " + min_y);
+         float page_height = page.findMediaBox().getHeight();
+         logger_.debug("page_height = " + page_height);
+
+         actual_lowest_point = page_height - min_y;
+       }
+       else if (pageRotation == 90)
+       {
+         float max_x = findMaxX(transformed_coordinates);
+         logger_.debug("max_x = " + max_x);
+
+         actual_lowest_point = max_x;
+       }
+       else if (pageRotation == 180)
+       {
+         float min_y = findMinY(transformed_coordinates);
+         logger_.debug("min_y = " + min_y);
+
+         actual_lowest_point = min_y;
+       }
+       else if (pageRotation == 270)
+       {
+         float min_x = findMinX(transformed_coordinates);
+         logger_.debug("min_x = " + min_x);
+
+         actual_lowest_point = min_x;
+       }
+
+       logger_.debug("actual_lowest_point = " + actual_lowest_point);
+
+       if (actual_lowest_point > PDFPage.this.max_image_ypos)
+       {
+         PDFPage.this.max_image_ypos = actual_lowest_point;
+       }
+
+       return;
+     }
+
+     if (xobject instanceof PDXObjectForm)
+     {
+       PDXObjectForm form = (PDXObjectForm) xobject;
+       COSStream invoke = (COSStream) form.getCOSObject();
+       PDResources pdResources = form.getResources();
+       PDPage current_page = context.getCurrentPage();
+       if (pdResources == null)
+       {
+         // A form without resources of its own uses those found for the page.
+         pdResources = current_page.findResources();
+       }
+
+       getContext().processSubStream(current_page, pdResources, invoke);
+     }
+   }
+ }
+
+ /**
+ * Transforms every position of the given array with the given matrix.
+ *
+ * @param coordinates
+ *          the positions to transform; the array itself is not modified.
+ * @param m
+ *          the matrix to apply to each position.
+ * @return a new array of the same length holding the transformed positions
+ *         in the same order.
+ */
+ public static Pos [] transtormCoordinates (Pos [] coordinates, Matrix m)
+ {
+   final int count = coordinates.length;
+   final Pos [] result = new Pos [count];
+
+   int index = 0;
+   while (index < count)
+   {
+     result[index] = transtormCoordinate(coordinates[index], m);
+     index++;
+   }
+
+   return result;
+ }
+
+ /**
+ * Multiplies the position, taken as the row vector (x, y, z), with the given
+ * matrix.
+ *
+ * @param pos
+ *          the position to transform; it is not modified.
+ * @param m
+ *          the matrix whose values (row, column) 0..2 are used.
+ * @return a new position holding the product.
+ */
+ public static Pos transtormCoordinate (Pos pos, Matrix m)
+ {
+   Pos transformed = new Pos();
+   transformed.x = pos.x * m.getValue(0, 0) + pos.y * m.getValue(1, 0) + pos.z * m.getValue(2, 0);
+   transformed.y = pos.x * m.getValue(0, 1) + pos.y * m.getValue(1, 1) + pos.z * m.getValue(2, 1);
+   transformed.z = pos.x * m.getValue(0, 2) + pos.y * m.getValue(1, 2) + pos.z * m.getValue(2, 2);
+
+   // Called for every corner of every image, so the message is only built
+   // when debug output is enabled.
+   if (logger_.isDebugEnabled())
+   {
+     logger_.debug(" transformed " + pos + " --> " + transformed);
+   }
+   return transformed;
+ }
+
+ /**
+ * Finds the smallest y value among the given positions.
+ *
+ * @param coordinates
+ *          the positions to inspect.
+ * @return the smallest y value, or positive infinity if no y value is
+ *         smaller than that (e.g. for an empty array).
+ */
+ public static float findMinY (Pos [] coordinates)
+ {
+   float smallest = Float.POSITIVE_INFINITY;
+   for (int idx = 0; idx < coordinates.length; idx++)
+   {
+     final float candidate = coordinates[idx].y;
+     smallest = (candidate < smallest) ? candidate : smallest;
+   }
+   return smallest;
+ }
+ /**
+ * Finds the largest x value among the given positions.
+ *
+ * @param coordinates
+ *          the positions to inspect.
+ * @return the largest x value, or negative infinity if no x value is
+ *         larger than that (e.g. for an empty array).
+ */
+ public static float findMaxX (Pos [] coordinates)
+ {
+   float largest = Float.NEGATIVE_INFINITY;
+   for (int idx = 0; idx < coordinates.length; idx++)
+   {
+     final float candidate = coordinates[idx].x;
+     largest = (candidate > largest) ? candidate : largest;
+   }
+   return largest;
+ }
+ /**
+ * Finds the smallest x value among the given positions.
+ *
+ * @param coordinates
+ *          the positions to inspect.
+ * @return the smallest x value, or positive infinity if no x value is
+ *         smaller than that (e.g. for an empty array).
+ */
+ public static float findMinX (Pos [] coordinates)
+ {
+   float smallest = Float.POSITIVE_INFINITY;
+   for (int idx = 0; idx < coordinates.length; idx++)
+   {
+     final float candidate = coordinates[idx].x;
+     smallest = (candidate < smallest) ? candidate : smallest;
+   }
+   return smallest;
+ }
+
+} \ No newline at end of file