1 files changed, 177 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
new file mode 100644
index 0000000..140a6c3
--- /dev/null
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -0,0 +1,177 @@
+/**
+ * <copyright> Copyright (c) 2006 by Know-Center, Graz, Austria </copyright>
+ * 
+ * This software is the confidential and proprietary information of Know-Center,
+ * Graz, Austria. You shall not disclose such Confidential Information and shall
+ * use it only in accordance with the terms of the license agreement you entered
+ * into with Know-Center.
+ * 
+ * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
+ * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
+ * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
+ * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
+ * DERIVATIVES.
+ * 
+ * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.PresentableException;
+
+import com.lowagie.text.Document;
+import com.lowagie.text.DocumentException;
+import com.lowagie.text.Rectangle;
+import com.lowagie.text.pdf.PdfContentByte;
+import com.lowagie.text.pdf.PdfImportedPage;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfWriter;
+
+/**
+ * Contains helper functions for textual signatures.
+ * 
+ * @author wprinz
+ */
+public class TextualSignature
+{
+
+  /**
+   * Extracts the document text from a given pdf.
+   * 
+   * <p>
+   * The input is first rewritten with {@link #normalizePDF(InputStream)} and
+   * the resulting bytes are then parsed by PDFBox. The text is extracted with
+   * a {@link PDFTextStripper} whose position sorting is switched off.
+   * </p>
+   * 
+   * <p>
+   * The parsed document is closed before this method returns, regardless of
+   * whether the text extraction succeeded.
+   * </p>
+   * 
+   * @param pdf_stream
+   *          The pdf input stream.
+   * @return Returns the extracted document text.
+   * @throws PresentableException
+   *           Wraps any exception raised during normalization, parsing or
+   *           text extraction.
+   */
+  public static String extractTextTextual(InputStream pdf_stream) throws PresentableException
+  {
+    try
+    {
+      // For text extraction, rebuild the document with iText in the same way
+      // as is done when signing, but of course without adding content.
+      byte[] bytes = normalizePDF(pdf_stream);
+
+      ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
+
+      PDFParser parser = new PDFParser(bais);
+      File temporary_dir = SettingsReader.getTemporaryDirectory();
+      parser.setTempDirectory(temporary_dir);
+      parser.parse();
+
+      PDDocument doc = parser.getPDDocument();
+      try
+      {
+        PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setSortByPosition(false);
+        return stripper.getText(doc);
+      }
+      finally
+      {
+        // Close the document even if the extraction fails, so that any
+        // resources held by the parsed document are always released.
+        doc.close();
+      }
+    }
+    catch (Exception e)
+    {
+      throw new PresentableException(e);
+    }
+  }
+
+  /**
+   * Normalizes a given binary PDF to a version PDFbox can handle correctly.
+   * 
+   * <p>
+   * PDFbox has serious problems with documents that use incremental updates or
+   * XObject forms. Therefore use this to remove incremental updates and create
+   * a streamlined document.
+   * </p>
+   * 
+   * <p>
+   * Note that this has nothing to do with text normalization. It just unifies
+   * the PDF documents that are fed into PDFbox for text extraction and page
+   * length determination.
+   * </p>
+   * 
+   * <p>
+   * Every page of the input is imported into a newly written iText document
+   * whose page size is set from the corresponding input page.
+   * </p>
+   * 
+   * @param input_pdf
+   *          The input pdf to be normalized.
+   * @return Returns the normalized pdf.
+   * @throws IOException
+   *           If reading the input pdf or writing the output fails.
+   * @throws DocumentException
+   *           Forwarded from iText while building the normalized pdf.
+   */
+  public static byte[] normalizePDF(InputStream input_pdf) throws IOException, DocumentException
+  {
+    // Read the input document with iText.
+    PdfReader reader = new PdfReader(input_pdf);
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+
+    // For some reason the Reader -> ImportPage -> Writer mechanism produces
+    // problems en masse. The text extractor may not be able to extract proper
+    // text from documents created with this method (although it works when a
+    // Table is appended)... very fragile.
+
+    Document document = new Document();
+
+    PdfWriter writer = PdfWriter.getInstance(document, baos);
+    document.open();
+
+    // The imported pages are drawn onto the writer's direct content.
+    PdfContentByte cb = writer.getDirectContent();
+    int page_count = reader.getNumberOfPages();
+    for (int page_num = 1; page_num <= page_count; page_num++)
+    {
+      // Request the size of the corresponding source page for the new page.
+      Rectangle new_size = reader.getPageSize(page_num);
+      document.setPageSize(new_size);
+      document.newPage();
+
+      PdfImportedPage page = writer.getImportedPage(reader, page_num);
+
+      // note that this will add an xobject form to the doc.
+      // the xobject form contains the content of the page.
+      cb.addTemplate(page, 0, 0);
+    }
+
+    // Close the document first; presumably this is what completes the output.
+    document.close();
+
+    // Closing a ByteArrayOutputStream has no effect; the buffered bytes stay
+    // available afterwards.
+    baos.close();
+    byte[] normalizedPDF = baos.toByteArray();
+
+    return normalizedPDF;
+  }
+}