From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 177 +++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java new file mode 100644 index 0000000..140a6c3 --- /dev/null +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -0,0 +1,177 @@ +/** + * Copyright (c) 2006 by Know-Center, Graz, Austria + * + * This software is the confidential and proprietary information of Know-Center, + * Graz, Austria. You shall not disclose such Confidential Information and shall + * use it only in accordance with the terms of the license agreement you entered + * into with Know-Center. + * + * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF + * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR + * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY + * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS + * DERIVATIVES. + * + * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.pdf; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +import org.pdfbox.pdfparser.PDFParser; +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.util.PDFTextStripper; + +import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; +import at.knowcenter.wag.egov.egiz.exceptions.PresentableException; + +import com.lowagie.text.Document; +import com.lowagie.text.DocumentException; +import com.lowagie.text.Rectangle; +import com.lowagie.text.pdf.PdfContentByte; +import com.lowagie.text.pdf.PdfImportedPage; +import com.lowagie.text.pdf.PdfReader; +import com.lowagie.text.pdf.PdfWriter; + +/** + * Contains helper function for textual signatures. + * + * @author wprinz + */ +public class TextualSignature +{ + + /** + * Extracts the document text from a given pdf. + * + * @param pdf_stream + * The pdf_input stream. + * @return Returns the extracted document text. + * @throws PresentableException + * Forwarded exception. + */ + public static String extractTextTextual(InputStream pdf_stream) throws PresentableException + { + try + { + // logger_.debug("===================================================="); + // logger_.debug("extractText:"); + + // For text extraction, create a temporary object with iText just as the + // one + // created + // when being signed, but of course without adding content. + + + byte[] bytes = normalizePDF(pdf_stream); + + ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + + PDFParser parser = new PDFParser(bais); + File temporary_dir = SettingsReader.getTemporaryDirectory(); + parser.setTempDirectory(temporary_dir); + parser.parse(); + + PDDocument doc = parser.getPDDocument(); + + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(false); + // stripper.setStartPage(4); + // stripper.setEndPage(4); + String text = stripper.getText(doc); + + doc.close(); + + // logger_.debug("text.length = " + text.length()); + // logger_.debug("===================================================="); + + return text; + + } + catch (Exception e) + { + throw new PresentableException(e); + } + } + + /** + * Normalizes a given binary PDF to a version PDFbox can handle correctly. + * + *

+ * PDFbox has serious problems with documents that use incremental updates or + * XObject forms. Therefor use this to remove incremental updates and create a + * streamlined document. + *

+ * + *

+ * Note that this has nothing to do with text normalization. It just unifies + * the PDF documents that are fed into PDFbox for text extraction and page + * length determination. + *

+ * + * @param input_pdf + * The input pdf to be normalized. + * @return Returns the normalized pdf. + * @throws IOException + * @throws DocumentException + */ + public static byte[] normalizePDF(InputStream input_pdf) throws IOException, DocumentException + { + PdfReader reader = new PdfReader(input_pdf); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + // For some reason the Reader -> ImportPage -> Writer mechanism produces + // problems en mass. + // The text extractor may not be able to extract proper text from + // documents + // created with + // this method (although it works when a Table is appended)... very + // fragile. + + Document document = new Document(); + + PdfWriter writer = PdfWriter.getInstance(document, baos); + document.open(); + + PdfContentByte cb = writer.getDirectContent(); + for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++) + { + Rectangle new_size = reader.getPageSize(page_num); + document.setPageSize(new_size); + document.newPage(); + + PdfImportedPage page = writer.getImportedPage(reader, page_num); + + // note that this will add an xobject form to the doc. + // the xobject form contains the content of the page. + cb.addTemplate(page, 0, 0); + + // wprinz: debugging + // cb.beginText(); + // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA, + // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14); + // cb.showText("page " + page_num); + // cb.endText(); + // wprinz: end debugging + } + + document.close(); + + // for (int i = 1; i <= reader.getNumberOfPages(); i++) + // { + // Rectangle rect = reader.getBoxSize(i, "bleed"); + // logger_.debug("rect[" + i + "] = " + rect); + // } + + baos.close(); + byte[] normalizedPDF = baos.toByteArray(); + + return normalizedPDF; + } +} -- cgit v1.2.3