From c4e41301d0746ce57044a3aa41375cff3a9f2b5e Mon Sep 17 00:00:00 2001 From: Christian Maierhofer Date: Wed, 8 Jun 2016 08:12:21 +0200 Subject: initial pdfbox-2 commit --- .../wag/egov/egiz/pdfbox2/pdf/PDFPage.java | 672 +++++++++++++++++++++ 1 file changed, 672 insertions(+) create mode 100644 pdf-as-pdfbox-2/src/main/java/at/knowcenter/wag/egov/egiz/pdfbox2/pdf/PDFPage.java (limited to 'pdf-as-pdfbox-2/src/main/java/at/knowcenter/wag/egov/egiz/pdfbox2/pdf/PDFPage.java') diff --git a/pdf-as-pdfbox-2/src/main/java/at/knowcenter/wag/egov/egiz/pdfbox2/pdf/PDFPage.java b/pdf-as-pdfbox-2/src/main/java/at/knowcenter/wag/egov/egiz/pdfbox2/pdf/PDFPage.java new file mode 100644 index 00000000..a3e68c95 --- /dev/null +++ b/pdf-as-pdfbox-2/src/main/java/at/knowcenter/wag/egov/egiz/pdfbox2/pdf/PDFPage.java @@ -0,0 +1,672 @@ +/******************************************************************************* + * Copyright 2014 by E-Government Innovation Center EGIZ, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + ******************************************************************************/ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: PDFPage.java,v 1.5 2006/10/31 08:09:33 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.pdfbox2.pdf; + +import java.awt.Rectangle; +import java.awt.geom.GeneralPath; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorProcessor; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.common.PDStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import at.knowcenter.wag.egov.egiz.pdf.Pos; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.ClosePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.CurveTo; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.CurveToReplicateFinalPoint; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.CurveToReplicateInitialPoint; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.LineTo; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.construction.MoveTo; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.CloseAndStrokePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.CloseFillEvenOddAndStrokePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.CloseFillNonZeroAndStrokePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.EndPath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.FillEvenOddAndStrokePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.FillNonZeroAndStrokePath; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.FillPathEvenOddRule; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.FillPathNonZeroWindingNumberRule; +import at.knowcenter.wag.egov.egiz.pdfbox2.pdf.operator.path.painting.StrokePath; + +/** + * PDFPage is an inner class that is used to calculate the page length of a PDF + * Document page. It extends the PDFTextStripper class and implement one + * interested method: + * {@link at.knowcenter.wag.egov.egiz.pdfbox2.pdf.PDFPage#showCharacter(TextPosition)}
+ * This method is called when processing the FileStream. By calling the method + * {@link org.apache.pdfbox.util.PDFStreamEngine#processStream(org.apache.pdfbox.pdmodel.PDPage, org.apache.pdfbox.pdmodel.PDResources, org.pdfbox.cos.COSStream)} + * the implemented method showCharacter is called. + * + * @author wlackner + * @see PDFTextStripper + */ +public class PDFPage extends PDFTextStripper{ + /** + * The logger definition. + */ + private static final Logger logger = LoggerFactory.getLogger(PDFPage.class); + + /** + * The maximum (lowest) y position of a character. + */ + protected float max_character_ypos = Float.NEGATIVE_INFINITY; + + /** + * The maximum (lowest y position of an image. + */ + protected float max_image_ypos = Float.NEGATIVE_INFINITY; + + /** + * The effective page height. + */ + protected float effectivePageHeight; + + /** + * The path currently being constructed. + */ + private GeneralPath currentPath = new GeneralPath(); + + private boolean legacy40; + + /** + * The lowest position of a drawn path (originating from top). + */ + private float maxPathRelatedYPositionFromTop = Float.NEGATIVE_INFINITY; + + /** + * Constructor. + * + * @param effectivePageHeight + * The height of the page to be evaluated. PDF elements outside + * this height will not be considered. + * + * @throws java.io.IOException + */ + public PDFPage(float effectivePageHeight, boolean legacy32, boolean legacy40) + throws IOException { + super(); + + this.legacy40 = legacy40; + + this.effectivePageHeight = effectivePageHeight; + + OperatorProcessor newInvoke = new MyInvoke(this); + newInvoke.setContext(this); + this.registerOperatorProcessor("Do", newInvoke); + + if (!legacy32) { + registerCustomPathOperators(); + } + } + + /** + * Registers operators responsible for path construction and painting in + * order to fix auto positioning on pages with path elements. + * + * @author Datentechnik Innovation GmbH + */ + private void registerCustomPathOperators() { + + // *** path construction + this.registerOperatorProcessor("m", new MoveTo(this)); + this.registerOperatorProcessor("l", new LineTo(this)); + this.registerOperatorProcessor("c", new CurveTo(this)); + this.registerOperatorProcessor("y", + new CurveToReplicateFinalPoint(this)); + this.registerOperatorProcessor("v", new CurveToReplicateInitialPoint( + this)); + this.registerOperatorProcessor("h", new ClosePath(this)); + + // *** path painting + + // "S": stroke path + this.registerOperatorProcessor("S", new StrokePath(this)); + this.registerOperatorProcessor("s", new CloseAndStrokePath(this)); + this.registerOperatorProcessor("f", + new FillPathNonZeroWindingNumberRule(this)); + this.registerOperatorProcessor("F", + new FillPathNonZeroWindingNumberRule(this)); + this.registerOperatorProcessor("f*", new FillPathEvenOddRule(this)); + this.registerOperatorProcessor("b", new CloseFillNonZeroAndStrokePath( + this)); + this.registerOperatorProcessor("B", new FillNonZeroAndStrokePath(this)); + this.registerOperatorProcessor("b*", new CloseFillEvenOddAndStrokePath( + this)); + this.registerOperatorProcessor("B*", new FillEvenOddAndStrokePath(this)); + this.registerOperatorProcessor("n", new EndPath(this)); + + // Note: The graphic context + // (org.pdfbox.pdmodel.graphics.PDGraphicsState) of the underlying + // pdfbox library does + // not yet support clipping. This prevents feasible usage of clipping + // operators (W, W*). + // operators.put("W", new ...(this)); + // operators.put("W*", new ...(this)); + + } + + /** + * Returns the path currently being constructed. + * + * @return The path currently being constructed. + */ + public GeneralPath getCurrentPath() { + return currentPath; + } + + /** + * Sets the current path. + * + * @param currentPath + * The new current path. + */ + public void setCurrentPath(GeneralPath currentPath) { + this.currentPath = currentPath; + } + + /** + * Registers a rectangle that bounds the path currently being drawn. + * + * @param bounds + * A rectangle depicting the bounds (coordinates originating from + * bottom left). + * @author Datentechnik Innovation GmbH + */ + public void registerPathBounds(Rectangle bounds) { + if (!bounds.isEmpty()) { + logger.debug("Registering path bounds: " + bounds); + + // vertical start of rectangle (counting from top of page) + float upperBoundYPositionFromTop; + + // vertical end of rectangle (counting from top of page) + // this depicts the current end of path-related page content + float lowerBoundYPositionFromTop; + + PDRectangle boundaryBox = this.getCurrentPage().getCropBox(); + + if (boundaryBox == null) { + boundaryBox = this.getCurrentPage().getMediaBox(); + } + + float pageHeight; + + switch (this.getCurrentPage().getRotation()) { + case 90: // CW + pageHeight = boundaryBox.getWidth(); + upperBoundYPositionFromTop = (float) bounds.getMinX(); + lowerBoundYPositionFromTop = (float) bounds.getMaxX(); + break; + case 180: + pageHeight = boundaryBox.getHeight(); + upperBoundYPositionFromTop = (float) bounds.getMinY(); + lowerBoundYPositionFromTop = (float) bounds.getMaxY(); + break; + case 270: // CCW + pageHeight = boundaryBox.getWidth(); + upperBoundYPositionFromTop = pageHeight + - (float) bounds.getMaxX(); + lowerBoundYPositionFromTop = pageHeight + - (float) bounds.getMinX(); + break; + default: + pageHeight = boundaryBox.getHeight(); + upperBoundYPositionFromTop = pageHeight + - (float) bounds.getMaxY(); + lowerBoundYPositionFromTop = pageHeight + - (float) bounds.getMinY(); + break; + } + + // new maximum ? + if (lowerBoundYPositionFromTop > maxPathRelatedYPositionFromTop) { + // Is the rectangle (at least partly) located above the footer + // line? + // (effective page height := page height - footer line) + if (upperBoundYPositionFromTop <= effectivePageHeight) { + // yes: update current end of path-related page content + maxPathRelatedYPositionFromTop = lowerBoundYPositionFromTop; + logger.trace("New max path related y position (from top): " + + maxPathRelatedYPositionFromTop); + } else { + // no: rectangle is fully located below the footer line -> + // ignore + logger.trace("Ignoring path bound below the footer line."); + } + } + } + } + + protected void processOperator(Operator operator, List arguments) + throws IOException { + logger.trace("operator = " + operator); + super.processOperator(operator, arguments); + } + + @Override + protected void processTextPosition(TextPosition text) { + showCharacter(text); + } + + // exthex + /** + * A method provided as an event interface to allow a subclass to perform + * some specific functionality when a character needs to be displayed. This + * method is used to calculate the latest position of a text in the page. + * Sorry for this missinterpretation of the method, but it is the only way + * to do this (provided by PDFBox)!!! + * + * @param text + * the character to be displayed -> calculate there y position. + */ + protected void showCharacter(TextPosition text) { + float current_y = text.getY(); + final String character = text.toString(); + + if (at.gv.egiz.pdfas.common.utils.StringUtils.whiteSpaceTrim(character) + .isEmpty()) { + return; + } + + int pageRotation = this.getCurrentPage().getRotation(); + // logger_.debug("PageRotation = " + pageRotation); + /*if (pageRotation == 0) { + current_y = text.getY(); + } + if (pageRotation == 90) { + current_y = text.getY(); + } + if (pageRotation == 180) { + current_y = text.getY(); + } + if (pageRotation == 270) { + current_y = text.getY(); + } + + if (current_y > this.effectivePageHeight) { + this.max_character_ypos = this.effectivePageHeight; + return; + } + + // store ypos of the char if it is not empty + if (current_y > this.max_character_ypos) { + this.max_character_ypos = current_y; + }*/ + + if (pageRotation == 0) { + current_y = text.getY(); + } + if (pageRotation == 90) { + current_y = text.getX(); + } + if (pageRotation == 180) { + float page_height = this.getCurrentPage().getMediaBox().getHeight(); + current_y = page_height - text.getY(); + } + if (pageRotation == 270) { + float page_height = this.getCurrentPage().getMediaBox().getHeight(); + current_y = page_height - text.getX(); + } + + if (current_y > this.effectivePageHeight) { + // logger_.debug("character is below footer_line. footer_line = " + + // this.footer_line + ", text.character=" + character + ", y=" + + // current_y); + return; + } + + // store ypos of the char if it is not empty + if (current_y > this.max_character_ypos) { + this.max_character_ypos = current_y; + } + } + + // use this funtion getting an unsorted text output + // public void showString(byte[] string) { + // logger_.debug(new String(string)); + // } + + /** + * Returns the calculated page length. + * + * @return the max page length value + */ + public float getMaxPageLength() { + if (logger.isDebugEnabled()) { + logger.debug("Determining page content length: text=" + + max_character_ypos + ", image=" + max_image_ypos + + ", path=" + maxPathRelatedYPositionFromTop); + } + return NumberUtils.max(max_character_ypos, max_image_ypos, + maxPathRelatedYPositionFromTop); + } + +// public Map getFonts() { //TODO: pdfbox2 - was @override +// +// COSBase fontObj = null; +// +// if (getCurrentPage().getResources() != null +// && getCurrentPage().getResources().getCOSObject() != null +// && getCurrentPage().getResources().getCOSObject() +// .getDictionaryObject(COSName.FONT) != null) { +// fontObj = getCurrentPage().getResources().getCOSObject() +// .getDictionaryObject(COSName.FONT); +// } +// +// Map fontMap = new HashMap(); +// +// Iterable fontNames = getCurrentPage().getResources().getFontNames(); +// for(COSName fn : fontNames){ +// fontMap.put(fn.getName(), getCurrentPage().getResources().getFont(fn)); +// } +// +// if (fontObj != null) { +// getCurrentPage().getResources().getCOSObject() +// .setItem(COSName.FONT, fontObj); +// } +// +// return fontMap; +// } + + public class MyInvoke extends OperatorProcessor { + + private PDFPage mypage; + + public MyInvoke(PDFPage page) { + this.mypage = page; + } + + public void process(Operator operator, List arguments) + throws IOException { + COSName name = (COSName) arguments.get(0); + + // PDResources res = context.getResources(); + + PDXObject xobject = context.getResources().getXObject(name); + + PDStream stream = xobject.getPDStream(); + COSStream cos_stream = stream.getStream(); + + COSName subtype = (COSName) cos_stream + .getDictionaryObject(COSName.SUBTYPE); + if (subtype.equals(COSName.IMAGE)) { + logger.debug("XObject Image"); + + Matrix ctm = context.getGraphicsState() + .getCurrentTransformationMatrix(); + logger.debug("ctm = " + ctm); + + Pos[] coordinates = new Pos[] { new Pos(0, 0, 1), + new Pos(1, 0, 1), new Pos(0, 1, 1), new Pos(1, 1, 1) }; + + Pos[] transformed_coordinates = transtormCoordinates( + coordinates, ctm); + + /********************************************************** + * pdf-as fix: calculating min and max point of an image to look + * where the signature should be placed fix solves problems with + * footer and images and placement of the signature in an image + * only pdf document + **********************************************************/ + + float actual_lowest_point = Float.NaN; + float actual_starting_point = Float.NaN; + + int pageRotation = this.mypage.getCurrentPage().getRotation(); + logger.debug("PageRotation = " + pageRotation); + if (pageRotation == 0) { + float min_y = findMinY(transformed_coordinates); + logger.debug("min_y = " + min_y); + float page_height = this.mypage.getCurrentPage() + .getMediaBox().getHeight(); + logger.debug("page_height = " + page_height); + + actual_lowest_point = page_height - min_y; + actual_starting_point = page_height + - findMaxY(transformed_coordinates); + } + if (pageRotation == 90) { + float max_x = findMaxX(transformed_coordinates); + logger.debug("max_x = " + max_x); + float page_width = this.mypage.getCurrentPage() + .getMediaBox().getWidth(); + logger.debug("page_width = " + page_width); + + actual_lowest_point = max_x; + actual_starting_point = findMinX(transformed_coordinates); + } + if (pageRotation == 180) { + float min_y = findMinY(transformed_coordinates); + logger.debug("min_y = " + min_y); + actual_lowest_point = findMaxY(transformed_coordinates); + actual_starting_point = actual_lowest_point + min_y; + } + if (pageRotation == 270) { + float min_x = findMinX(transformed_coordinates); + logger.debug("min_x = " + min_x); + + float page_width = this.mypage.getCurrentPage() + .getMediaBox().getWidth(); + logger.debug("page_width = " + page_width); + + actual_lowest_point = page_width - min_x; + actual_starting_point = page_width + - findMaxX(transformed_coordinates); + } + + logger.debug("actual_lowest_point = " + actual_lowest_point); + + if (actual_lowest_point > PDFPage.this.effectivePageHeight + && actual_starting_point > PDFPage.this.effectivePageHeight) { + logger.debug("image is below footer_line"); + return; + } + + if (actual_lowest_point > PDFPage.this.max_image_ypos) { + PDFPage.this.max_image_ypos = actual_lowest_point; + } + + return; + } + + if (xobject instanceof PDFormXObject) { + PDFormXObject form = (PDFormXObject) xobject; + COSStream invoke = (COSStream) form.getCOSObject(); + PDResources pdResources = form.getResources(); + PDPage page = context.getCurrentPage(); + if (pdResources == null) { + pdResources = page.getResources(); + } + + //getContext().processSubStream(page, pdResources, invoke);//TODO: pdfbox2 - correct? + getContext().processPage(page); + + } + } + + @Override + public String getName() { + // TODO Auto-generated method stub + return null; + } + } + + public static Pos[] transtormCoordinates(Pos[] coordinates, Matrix m) { + Pos[] transformed = new Pos[coordinates.length]; + for (int i = 0; i < coordinates.length; i++) { + transformed[i] = transtormCoordinate(coordinates[i], m); + } + return transformed; + } + + public static Pos transtormCoordinate(Pos pos, Matrix m) { + Pos transformed = new Pos(); + transformed.x = pos.x * m.getValue(0, 0) + pos.y * m.getValue(1, 0) + + pos.z * m.getValue(2, 0); + transformed.y = pos.x * m.getValue(0, 1) + pos.y * m.getValue(1, 1) + + pos.z * m.getValue(2, 1); + transformed.z = pos.x * m.getValue(0, 2) + pos.y * m.getValue(1, 2) + + pos.z * m.getValue(2, 2); + + logger.debug(" transformed " + pos + " --> " + transformed); + return transformed; + } + + public static float findMinY(Pos[] coordinates) { + float min = Float.POSITIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) { + if (coordinates[i].y < min) { + min = coordinates[i].y; + } + } + return min; + } + + public static float findMaxY(Pos[] coordinates) { + float max = 0; + for (int i = 0; i < coordinates.length; i++) { + if (coordinates[i].y > max) { + max = coordinates[i].y; + } + } + return max; + } + + public static float findMaxX(Pos[] coordinates) { + float max = Float.NEGATIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) { + if (coordinates[i].x > max) { + max = coordinates[i].x; + } + } + return max; + } + + public static float findMinX(Pos[] coordinates) { + float min = Float.POSITIVE_INFINITY; + for (int i = 0; i < coordinates.length; i++) { + if (coordinates[i].x < min) { + min = coordinates[i].x; + } + } + return min; + } + + public void processAnnotation(PDAnnotation anno) { + float current_y = anno.getRectangle().getLowerLeftY(); + float upper_y = 0; + PDPage page = anno.getPage(); + + if (page == null) { + page = getCurrentPage(); + } + + if (page == null) { + logger.warn("Annotation without page! The position might not be correct!"); + return; + } + + int pageRotation = page.getRotation(); + // logger_.debug("PageRotation = " + pageRotation); + if (pageRotation == 0) { + float page_height = page.getMediaBox().getHeight(); + current_y = page_height - anno.getRectangle().getLowerLeftY(); + upper_y = page_height - anno.getRectangle().getUpperRightY(); + } + if (pageRotation == 90) { + current_y = anno.getRectangle().getUpperRightX(); + upper_y = anno.getRectangle().getLowerLeftX(); + } + if (pageRotation == 180) { + current_y = anno.getRectangle().getUpperRightY(); + upper_y = anno.getRectangle().getLowerLeftY(); + } + if (pageRotation == 270) { + float page_width = page.getMediaBox().getWidth(); + current_y = page_width - anno.getRectangle().getLowerLeftX(); + upper_y = page_width - anno.getRectangle().getUpperRightX(); + } + + + + if (current_y > this.effectivePageHeight) { + if(!this.legacy40 && upper_y < this.effectivePageHeight) { + // Bottom of annotation is below footer line, + // but top of annotation is above footer line! + // so no place left on this page! + this.max_character_ypos = this.effectivePageHeight; + } + return; + } + + // store ypos of the char if it is not empty + if (current_y > this.max_character_ypos) { + this.max_character_ypos = current_y; + } + } + +} -- cgit v1.2.3