1 files changed, 282 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
new file mode 100644
index 0000000..35a0768
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -0,0 +1,282 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.log4j.Logger;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.utils.PDFASUtils;
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+
+import com.lowagie.text.Document;
+import com.lowagie.text.DocumentException;
+import com.lowagie.text.Rectangle;
+import com.lowagie.text.pdf.PdfContentByte;
+import com.lowagie.text.pdf.PdfImportedPage;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfWriter;
+
+/**
+ * Contains helper functions for textual signatures.
+ * 
+ * @author wprinz
+ */
+public class TextualSignature
+{
+  /**
+   * The logger of this class, obtained from the ConfigLogger.
+   */
+  private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);
+  
+  /**
+   * Extracts the document text from a given PDF.
+   *
+   * <p>
+   * The PDF is first rewritten with iText: every page is imported as an XObject
+   * form into a fresh document that uses the page size including rotation. The
+   * rewritten PDF is then parsed with PDFBox and its text is stripped.
+   * </p>
+   *
+   * <p>
+   * The PDFBox document is closed on every path, so that a failing extraction
+   * does not leave it open.
+   * </p>
+   *
+   * @param pdfDataSource
+   *          The data source providing the PDF as byte array.
+   * @param encoding
+   *          The encoding that is passed on to the PDFBox text stripper.
+   * @return Returns the extracted document text.
+   * @throws PDFDocumentException
+   *           Thrown with ErrorCode.TEXT_EXTRACTION_EXCEPTION if stripping the
+   *           text fails and with ErrorCode.DOCUMENT_CANNOT_BE_READ if reading,
+   *           rewriting or parsing the PDF fails. Presumably the permission
+   *           check on the reader can raise it as well (not visible here).
+   */
+  public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException
+  {
+    PerformanceCounters.textExtractions.increment();
+
+    try
+    {
+      // iText: read the input and check that the document may be processed.
+      byte[] pdf_data = pdfDataSource.getAsByteArray();
+      PdfReader reader = new PdfReader(pdf_data);
+      PDFASUtils.checkReaderPermissions(reader);
+
+      // PERF: PDF normalization needs byte array - this is costly
+      ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
+
+      // For text extraction, create a temporary document with iText just as
+      // the one created when being signed, but of course without adding
+      // content.
+      //
+      // For some reason the Reader -> ImportPage -> Writer mechanism produces
+      // problems en masse. The text extractor may not be able to extract
+      // proper text from documents created with this method (although it
+      // works when a Table is appended)... very fragile.
+      Document document = new Document();
+      PdfWriter writer = PdfWriter.getInstance(document, baos);
+      document.open();
+
+      int first_page_rotation = 0;
+      PdfContentByte cb = writer.getDirectContent();
+      for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++)
+      {
+        Rectangle new_size_withrot = reader.getPageSizeWithRotation(page_num);
+        if (page_num == 1)
+        {
+          // remembered for the text stripper, see below
+          first_page_rotation = new_size_withrot.getRotation();
+        }
+        document.setPageSize(new_size_withrot);
+        document.newPage();
+
+        PdfImportedPage page = writer.getImportedPage(reader, page_num);
+        // note that this will add an xobject form to the doc.
+        // the xobject form contains the content of the page.
+        cb.addTemplate(page, 0, 0);
+      }
+
+      document.close();
+      baos.close();
+      byte[] normalizedPDF = baos.toByteArray();
+
+      // PDFBox: parse the rewritten document.
+      ByteArrayInputStream bais = new ByteArrayInputStream(normalizedPDF);
+      PDFParser parser = new PDFParser(bais);
+      File temporary_dir = SettingsReader.getTemporaryDirectory();
+      parser.setTempDirectory(temporary_dir);
+      parser.parse();
+
+      PDDocument doc = parser.getPDDocument();
+      String text;
+      boolean extracted = false;
+      try
+      {
+        // The rotation of the first page is handed over to the stripper.
+        PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setSortByPosition(false);
+        stripper.setGetFirstPageRotationFromThis(true);
+        stripper.setFirstPageRotation(first_page_rotation);
+
+        logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
+        try
+        {
+          text = stripper.getText(doc, encoding);
+        }
+        catch (Exception e)
+        {
+          throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e);
+        }
+        logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
+        extracted = true;
+      }
+      finally
+      {
+        if (!extracted)
+        {
+          // Failure path: release the PDFBox document, but do not let a
+          // failing close() replace the exception that is already on its way.
+          try
+          {
+            doc.close();
+          }
+          catch (IOException e)
+          {
+            logger_.warn("TextualSignator extractTextTextual: Closing the document failed", e);
+          }
+        }
+      }
+
+      // Success path: a failing close() is reported like before.
+      doc.close();
+      return text;
+    }
+    catch (IllegalArgumentException e)
+    {
+      throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+    }
+    catch (IOException e)
+    {
+      throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+    }
+    catch (DocumentException e)
+    {
+      throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+    }
+  }
+  
+  /**
+   * Rewrites a binary PDF into a version that PDFbox can handle correctly.
+   *
+   * <p>
+   * PDFbox has serious problems with documents that use incremental updates or
+   * XObject forms. Therefore the document is rebuilt with iText in order to
+   * remove incremental updates and to obtain a streamlined document.
+   * </p>
+   *
+   * <p>
+   * This is unrelated to text normalization. It only unifies the PDF documents
+   * that are fed into PDFbox for text extraction and page length
+   * determination.
+   * </p>
+   *
+   * <p>
+   * The rebuild works page by page:
+   * </p>
+   * <ol>
+   * <li>the page size of the target document is set to the size of the source
+   * page including its rotation,</li>
+   * <li>a new page is started in the target document,</li>
+   * <li>the source page is imported and added as a template at position (0, 0)
+   * of the new page.</li>
+   * </ol>
+   *
+   * @param pdfDataSource
+   *          The data source providing the PDF to be normalized as byte array.
+   * @return Returns the normalized PDF.
+   * @throws IOException
+   *           Forwarded exception, e.g. from iText reading the PDF.
+   * @throws DocumentException
+   *           Forwarded exception from iText setting up the target document.
+   * @throws PDFDocumentException
+   *           Declared by the method; presumably raised by the permission
+   *           check on the reader (not visible here).
+   */
+  public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException
+  {
+    // iText: read the source and check that the document may be processed.
+    PdfReader sourceReader = new PdfReader(pdfDataSource.getAsByteArray());
+    PDFASUtils.checkReaderPermissions(sourceReader);
+
+    // PERF: PDF normalization needs a byte array
+    ByteArrayOutputStream normalized = new ByteArrayOutputStream();
+
+    // For some reason the Reader -> ImportPage -> Writer mechanism produces
+    // problems en masse. The text extractor may not be able to extract proper
+    // text from documents created with this method (although it works when a
+    // Table is appended)... very fragile.
+    Document target = new Document();
+    PdfWriter targetWriter = PdfWriter.getInstance(target, normalized);
+    target.open();
+
+    PdfContentByte content = targetWriter.getDirectContent();
+    int pageIndex = 1;
+    while (pageIndex <= sourceReader.getNumberOfPages())
+    {
+      // The target page takes over size and rotation of the source page.
+      target.setPageSize(sourceReader.getPageSizeWithRotation(pageIndex));
+      target.newPage();
+
+      // note that this will add an xobject form to the doc.
+      // the xobject form contains the content of the page.
+      PdfImportedPage imported = targetWriter.getImportedPage(sourceReader, pageIndex);
+      content.addTemplate(imported, 0, 0);
+
+      pageIndex++;
+    }
+
+    // Close the document before the bytes are taken from the buffer.
+    target.close();
+    normalized.close();
+
+    byte[] normalizedPDF = normalized.toByteArray();
+    return normalizedPDF;
+  }
+}