aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java177
1 files changed, 177 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
new file mode 100644
index 0000000..140a6c3
--- /dev/null
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -0,0 +1,177 @@
+/**
+ * <copyright> Copyright (c) 2006 by Know-Center, Graz, Austria </copyright>
+ *
+ * This software is the confidential and proprietary information of Know-Center,
+ * Graz, Austria. You shall not disclose such Confidential Information and shall
+ * use it only in accordance with the terms of the license agreement you entered
+ * into with Know-Center.
+ *
+ * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
+ * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
+ * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
+ * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
+ * DERIVATIVES.
+ *
+ * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.PresentableException;
+
+import com.lowagie.text.Document;
+import com.lowagie.text.DocumentException;
+import com.lowagie.text.Rectangle;
+import com.lowagie.text.pdf.PdfContentByte;
+import com.lowagie.text.pdf.PdfImportedPage;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfWriter;
+
+/**
+ * Contains helper functions for textual signatures.
+ *
+ * @author wprinz
+ */
+public class TextualSignature
+{
+
+  /**
+   * Extracts the document text from a given pdf.
+   *
+   * <p>
+   * The input is first passed through {@link #normalizePDF(InputStream)} and
+   * the normalized bytes are then parsed with PDFbox. Text is extracted with
+   * position sorting switched off.
+   * </p>
+   *
+   * @param pdf_stream
+   *          The pdf input stream.
+   * @return Returns the extracted document text.
+   * @throws PresentableException
+   *           Forwarded exception. Every exception raised while normalizing,
+   *           parsing or extracting is wrapped into a PresentableException.
+   */
+  public static String extractTextTextual(InputStream pdf_stream) throws PresentableException
+  {
+    try
+    {
+      // For text extraction, create a temporary object with iText just as the
+      // one created when being signed, but of course without adding content.
+      byte[] normalized_bytes = normalizePDF(pdf_stream);
+
+      PDFParser parser = new PDFParser(new ByteArrayInputStream(normalized_bytes));
+      File temporary_dir = SettingsReader.getTemporaryDirectory();
+      parser.setTempDirectory(temporary_dir);
+      parser.parse();
+
+      PDDocument doc = parser.getPDDocument();
+      try
+      {
+        PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setSortByPosition(false);
+        return stripper.getText(doc);
+      }
+      finally
+      {
+        // Close the document even if text extraction fails. The parser was
+        // given a temporary directory, so the document presumably holds
+        // scratch resources there that must not be left behind.
+        doc.close();
+      }
+    }
+    catch (Exception e)
+    {
+      throw new PresentableException(e);
+    }
+  }
+
+  /**
+   * Normalizes a given binary PDF to a version PDFbox can handle correctly.
+   *
+   * <p>
+   * PDFbox has serious problems with documents that use incremental updates or
+   * XObject forms. Therefore use this to remove incremental updates and create
+   * a streamlined document.
+   * </p>
+   *
+   * <p>
+   * Note that this has nothing to do with text normalization. It just unifies
+   * the PDF documents that are fed into PDFbox for text extraction and page
+   * length determination.
+   * </p>
+   *
+   * @param input_pdf
+   *          The input pdf to be normalized.
+   * @return Returns the normalized pdf.
+   * @throws IOException
+   *           Forwarded exception.
+   * @throws DocumentException
+   *           Forwarded exception.
+   */
+  public static byte[] normalizePDF(InputStream input_pdf) throws IOException, DocumentException
+  {
+    PdfReader reader = new PdfReader(input_pdf);
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+
+    // For some reason the Reader -> ImportPage -> Writer mechanism produces
+    // problems en masse. The text extractor may not be able to extract proper
+    // text from documents created with this method (although it works when a
+    // Table is appended)... very fragile.
+
+    Document document = new Document();
+
+    PdfWriter writer = PdfWriter.getInstance(document, baos);
+    document.open();
+
+    PdfContentByte cb = writer.getDirectContent();
+
+    // The page count is read once instead of on every loop iteration.
+    int number_of_pages = reader.getNumberOfPages();
+    for (int page_num = 1; page_num <= number_of_pages; page_num++)
+    {
+      // Give every output page the size of its source page. The size has to
+      // be set before newPage() so that it applies to the page being started.
+      Rectangle new_size = reader.getPageSize(page_num);
+      document.setPageSize(new_size);
+      document.newPage();
+
+      PdfImportedPage page = writer.getImportedPage(reader, page_num);
+
+      // note that this will add an xobject form to the doc.
+      // the xobject form contains the content of the page.
+      cb.addTemplate(page, 0, 0);
+    }
+
+    // Closing the document finishes the output written to baos.
+    document.close();
+
+    // Closing a ByteArrayOutputStream has no effect, so the buffer is simply
+    // read out.
+    return baos.toByteArray();
+  }
+}