From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../at/knowcenter/wag/egov/egiz/pdf/PDFPage.java | 539 +++++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java new file mode 100644 index 0000000..bed1b65 --- /dev/null +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/PDFPage.java @@ -0,0 +1,539 @@ +/** + * Copyright (c) 2006 by Know-Center, Graz, Austria + * + * This software is the confidential and proprietary information of Know-Center, + * Graz, Austria. You shall not disclose such Confidential Information and shall + * use it only in accordance with the terms of the license agreement you entered + * into with Know-Center. + * + * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF + * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR + * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY + * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS + * DERIVATIVES. + * + * $Id: PDFPage.java,v 1.5 2006/10/31 08:09:33 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.pdf; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.log4j.Logger; +import org.pdfbox.cos.COSName; +import org.pdfbox.cos.COSStream; +import org.pdfbox.pdmodel.PDPage; +import org.pdfbox.pdmodel.PDResources; +import org.pdfbox.pdmodel.common.PDStream; +import org.pdfbox.pdmodel.graphics.xobject.PDXObject; +import org.pdfbox.pdmodel.graphics.xobject.PDXObjectForm; +import org.pdfbox.util.Matrix; +import org.pdfbox.util.PDFOperator; +import org.pdfbox.util.PDFTextStripper; +import org.pdfbox.util.TextPosition; +import org.pdfbox.util.operator.OperatorProcessor; + +import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; + +/** + * PDFPage is an inner class that is used to calculate the page length of a PDF + * Document page. It extends the PDFTextStripper class and implement one + * interested method: {@link PDFPage#showCharacter(TextPosition)}
+ * This method is called when processing the FileStream. By calling the method + * {@link org.pdfbox.util.PDFStreamEngine#processStream(org.pdfbox.pdmodel.PDPage, org.pdfbox.pdmodel.PDResources, org.pdfbox.cos.COSStream)} + * the implemented method showCharacter is called. + * + * @author wlackner + * @see PDFTextStripper + */ +public class PDFPage extends PDFTextStripper +{ + /** + * The logger definition. + */ + private static final Logger logger_ = ConfigLogger.getLogger(PDFPage.class); + + /** + * The maximum (lowest) y position of a character. + */ + protected float max_character_ypos = Float.NEGATIVE_INFINITY; + + /** + * The maximum (lowest y position of an image. + */ + protected float max_image_ypos = Float.NEGATIVE_INFINITY; + + /** + * The empty constructor. + * + * @throws IOException + */ + public PDFPage() throws IOException + { + super(); + + OperatorProcessor newInvoke = new MyInvoke(); + newInvoke.setContext(this); + operators.put("Do", newInvoke); + } + + // /** + // * You should override this method if you want to perform an action when a + // * string is being shown. + // * + // * @param string The string to display. + // * + // * @throws IOException If there is an error showing the string + // */ + // public void showString( byte[] string ) throws IOException + // { + // float spaceWidth = 0; + // float spacing = 0; + // StringBuffer stringResult = new StringBuffer(string.length); + // + // float characterDisplacement = 0; + // float spaceDisplacement = 0; + // + // PDGraphicsState graphicsState = getGraphicsState(); + // float fontSize = graphicsState.getTextState().getFontSize(); + // float horizontalScaling = + // graphicsState.getTextState().getHorizontalScalingPercent()/100f; + // float rise = graphicsState.getTextState().getRise(); + // final float wordSpacing = graphicsState.getTextState().getWordSpacing(); + // final float characterSpacing = + // graphicsState.getTextState().getCharacterSpacing(); + // float wordSpacingDisplacement = 0; + // + // PDFont font = graphicsState.getTextState().getFont(); + // + // //This will typically be 1000 but in the case of a type3 font + // //this might be a different number + // float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 + // ); + // Float averageWidth = (Float)fontToAverageWidths.get( font ); + // if( averageWidth == null ) + // { + // averageWidth = new Float( font.getAverageFontWidth() ); + // fontToAverageWidths.put( font, averageWidth ); + // } + // + // Matrix initialMatrix = new Matrix(); + // initialMatrix.setValue(0,0,1); + // initialMatrix.setValue(0,1,0); + // initialMatrix.setValue(0,2,0); + // initialMatrix.setValue(1,0,0); + // initialMatrix.setValue(1,1,1); + // initialMatrix.setValue(1,2,0); + // initialMatrix.setValue(2,0,0); + // initialMatrix.setValue(2,1,rise); + // initialMatrix.setValue(2,2,1); + // + // + // //this + // int codeLength = 1; + // Matrix ctm = graphicsState.getCurrentTransformationMatrix(); + // + // //lets see what the space displacement should be + // spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 + // )/glyphSpaceToTextSpaceFactor); + // if( spaceDisplacement == 0 ) + // { + // spaceDisplacement = + // (averageWidth.floatValue()/glyphSpaceToTextSpaceFactor); + // //The average space width appears to be higher than necessary + // //so lets make it a little bit smaller. + // spaceDisplacement *= .80f; + // if( log.isDebugEnabled() ) + // { + // log.debug( "Font: Space From Average=" + spaceDisplacement ); + // } + // } + // int pageRotation = page.findRotation(); + // + // // very strange.... the ctms are multiplied by right, but suddenly the + // textM is multiplied from the left. + // // but: PDF matrices are multiplied from left ==> ctm is wrong + // Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm ); + // float x = trm.getValue(2,0); + // float y = trm.getValue(2,1); + // float flipped_y = -y + page.findMediaBox().getHeight(); + // if( pageRotation == 0 ) + // { + // trm.setValue( 2,1, flipped_y ); + // } + // else if( pageRotation == 90 ) + // { + // trm.setValue( 2,0, y ); + // trm.setValue( 2,1, x ); + // } + // else if( pageRotation == 270 ) + // { + // trm.setValue( 2,0, flipped_y ); + // trm.setValue( 2,1, x ); + // } + // for( int i=0; i this.max_character_ypos) + { + this.max_character_ypos = current_y; + //logger_.debug("text.character=" + character + ", y=" + current_y); + // System.err.println(character + "|" + current_y); + } + + // logger_.debug("text.character=" + character + ", y=" + current_y); + // System.err.println(character + "|" + current_y); + } + + // use this funtion getting an unsorted text output + // public void showString(byte[] string) { + // logger_.debug(new String(string)); + // } + + /** + * Returns the calculated page length. + * + * @return the max page length value + */ + public float getMaxPageLength() + { + float max_ypos = Float.NEGATIVE_INFINITY; + + if (this.max_character_ypos > this.max_image_ypos) + { + max_ypos = this.max_character_ypos; + } + else + { + max_ypos = this.max_image_ypos; + } + + return max_ypos; + } + + public class MyInvoke extends OperatorProcessor + { + + public void process(PDFOperator operator, List arguments) throws IOException + { + COSName name = (COSName) arguments.get(0); + logger_.debug(""); + + // PDResources res = context.getResources(); + + Map xobjects = context.getXObjects(); + PDXObject xobject = (PDXObject) xobjects.get(name.getName()); + + PDStream stream = xobject.getPDStream(); + COSStream cos_stream = stream.getStream(); + + COSName subtype = (COSName) cos_stream.getDictionaryObject(COSName.SUBTYPE); + if (subtype.equals(COSName.IMAGE)) + { + logger_.debug("XObject Image"); + + Matrix ctm = context.getGraphicsState().getCurrentTransformationMatrix(); + logger_.debug("ctm = " + ctm); + + Pos [] coordinates = new Pos [] { + new Pos(0, 0, 1), + new Pos(1, 0, 1), + new Pos(0, 1, 1), + new Pos(1, 1, 1) }; + + Pos [] transformed_coordinates = transtormCoordinates(coordinates, ctm); + + float actual_lowest_point = Float.NaN; + + int pageRotation = page.findRotation(); + logger_.debug("PageRotation = " + pageRotation); + if (pageRotation == 0) + { + float min_y = findMinY(transformed_coordinates); + logger_.debug("min_y = " + min_y); + float page_height = page.findMediaBox().getHeight(); + logger_.debug("page_height = " + page_height); + + actual_lowest_point = page_height - min_y; + } + if (pageRotation == 90) + { + float max_x = findMaxX(transformed_coordinates); + logger_.debug("max_x = " + max_x); +// float page_width = page.findMediaBox().getWidth(); +// logger_.debug("page_width = " + page_width); + + actual_lowest_point = max_x; + } + if (pageRotation == 180) + { + float min_y = findMinY(transformed_coordinates); + logger_.debug("min_y = " + min_y); + actual_lowest_point = min_y; + } + if (pageRotation == 270) + { + float min_x = findMinX(transformed_coordinates); + logger_.debug("min_x = " + min_x); +// float page_width = page.findMediaBox().getWidth(); +// logger_.debug("page_width = " + page_width); + + actual_lowest_point = min_x; + } + + + logger_.debug("actual_lowest_point = " + actual_lowest_point); + + if (actual_lowest_point > PDFPage.this.max_image_ypos) + { + PDFPage.this.max_image_ypos = actual_lowest_point; + } + + return; + } + + if (xobject instanceof PDXObjectForm) + { + PDXObjectForm form = (PDXObjectForm) xobject; + COSStream invoke = (COSStream) form.getCOSObject(); + PDResources pdResources = form.getResources(); + PDPage page = context.getCurrentPage(); + if (pdResources == null) + { + pdResources = page.findResources(); + } + + getContext().processSubStream(page, pdResources, invoke); + } + } + } + + public static Pos [] transtormCoordinates (Pos [] coordinates, Matrix m) + { + Pos [] transformed = new Pos [coordinates.length]; + for (int i = 0; i < coordinates.length; i++) + { + transformed[i] = transtormCoordinate(coordinates[i], m); + } + return transformed; + } + + public static Pos transtormCoordinate (Pos pos, Matrix m) + { + Pos transformed = new Pos(); + transformed.x = pos.x * m.getValue(0, 0) + pos.y * m.getValue(1, 0) + pos.z * m.getValue(2, 0); + transformed.y = pos.x * m.getValue(0, 1) + pos.y * m.getValue(1, 1) + pos.z * m.getValue(2, 1); + transformed.z = pos.x * m.getValue(0, 2) + pos.y * m.getValue(1, 2) + pos.z * m.getValue(2, 2); + + logger_.debug(" transformed " + pos + " --> " + transformed); + return transformed; + } + + public static float findMinY (Pos [] coordinates) + { + float min = Float.POSITIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) + { + if (coordinates[i].y < min) + { + min = coordinates[i].y; + } + } + return min; + } + public static float findMaxX (Pos [] coordinates) + { + float max = Float.NEGATIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) + { + if (coordinates[i].x > max) + { + max = coordinates[i].x; + } + } + return max; + } + public static float findMinX (Pos [] coordinates) + { + float min = Float.POSITIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) + { + if (coordinates[i].x < min) + { + min = coordinates[i].x; + } + } + return min; + } + +} \ No newline at end of file -- cgit v1.2.3