aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java')
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java282
1 files changed, 282 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
new file mode 100644
index 0000000..35a0768
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -0,0 +1,282 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.log4j.Logger;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.utils.PDFASUtils;
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+
+import com.lowagie.text.Document;
+import com.lowagie.text.DocumentException;
+import com.lowagie.text.Rectangle;
+import com.lowagie.text.pdf.PdfContentByte;
+import com.lowagie.text.pdf.PdfImportedPage;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfWriter;
+
+/**
+ * Contains helper function for textual signatures.
+ *
+ * @author wprinz
+ */
+public class TextualSignature
+{
+ /**
+ * The logger definition.
+ */
+ private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);
+
+ /**
+ * Extracts the document text from a given pdf.
+ *
+ * @param pdf_stream
+ * The pdf_input stream.
+ * @return Returns the extracted document text.
+ * @throws PDFDocumentException
+ * @throws TextExtractionException
+ * Forwarded exception.
+ */
+ public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException
+ {
+ PerformanceCounters.textExtractions.increment();
+
+ try
+ {
+ int first_page_rotation = 0;
+ // logger_.debug("====================================================");
+ // logger_.debug("extractText:");
+
+ // For text extraction, create a temporary object with iText just as the
+ // one
+ // created
+ // when being signed, but of course without adding content.
+
+
+ // byte[] bytes = normalizePDF(pdf_stream);
+ //iText
+
+ byte [] pdf_data = pdfDataSource.getAsByteArray();
+ PdfReader reader = new PdfReader(pdf_data);
+ PDFASUtils.checkReaderPermissions(reader);
+ //pdf_stream.close();
+
+ // PERF: PDF normalization needs byte array - this is costy
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
+
+ // For some reason the Reader -> ImportPage -> Writer mechanism produces
+ // problems en mass.
+ // The text extractor may not be able to extract proper text from
+ // documents
+ // created with
+ // this method (although it works when a Table is appended)... very
+ // fragile.
+
+ Document document = new Document();
+
+ PdfWriter writer = PdfWriter.getInstance(document, baos);
+ document.open();
+
+ PdfContentByte cb = writer.getDirectContent();
+ for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++)
+ {
+ //Rectangle new_size = reader.getPageSize(page_num);
+ //logger_.info("PageSize with no rotaion: Pagenr:"+page_num+" Size: "+new_size);
+ //document.setPageSize(new_size);
+ Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num);
+ if (page_num == 1)
+ {
+ //setFirstPageRotation(new_size_withrot.getRotation());
+ first_page_rotation = new_size_withrot.getRotation();
+ //logger_.info("iText first_page_rotation="+new_size_withrot.getRotation());
+ }
+ //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot);
+ //document.setPageSize(new_size);
+ document.setPageSize(new_size_withrot);
+ document.newPage();
+
+ PdfImportedPage page = writer.getImportedPage(reader, page_num);
+ // note that this will add an xobject form to the doc.
+ // the xobject form contains the content of the page.
+ cb.addTemplate(page, 0, 0);
+
+ // wprinz: debugging
+ // cb.beginText();
+ // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA,
+ // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14);
+ // cb.showText("page " + page_num);
+ // cb.endText();
+ // wprinz: end debugging
+ }
+
+ document.close();
+
+ // for (int i = 1; i <= reader.getNumberOfPages(); i++)
+ // {
+ // Rectangle rect = reader.getBoxSize(i, "bleed");
+ // logger_.debug("rect[" + i + "] = " + rect);
+ // }
+
+ baos.close();
+ byte[] normalizedPDF = baos.toByteArray();
+
+ ByteArrayInputStream bais = new ByteArrayInputStream(normalizedPDF);
+ //PDFBox-parser
+ PDFParser parser = new PDFParser(bais);
+ File temporary_dir = SettingsReader.getTemporaryDirectory();
+ //logger_.info("temporary_dir="+temporary_dir.getAbsolutePath());
+ parser.setTempDirectory(temporary_dir);
+ parser.parse();
+
+ PDDocument doc = parser.getPDDocument();
+ //System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages());
+
+ PDFTextStripper stripper = new PDFTextStripper();
+ stripper.setSortByPosition(false);
+ stripper.setGetFirstPageRotationFromThis(true);
+ stripper.setFirstPageRotation(first_page_rotation);
+
+ // stripper.setStartPage(4);
+ // stripper.setEndPage(4);
+ logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
+ String text;
+ try {
+ text = stripper.getText(doc, encoding);
+ } catch (Exception e) {
+ throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e);
+ }
+ logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
+
+ doc.close();
+ //logger_.debug("TextualSignator extractTextTextual="+text);
+ return text;
+
+ }
+ catch (IllegalArgumentException e)
+ {
+ throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+ }
+ catch (IOException e)
+ {
+ throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+ }
+ catch (DocumentException e)
+ {
+ throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+ }
+ }
+
+ /**
+ * Normalizes a given binary PDF to a version PDFbox can handle correctly.
+ *
+ * <p>
+ * PDFbox has serious problems with documents that use incremental updates or
+ * XObject forms. Therefor use this to remove incremental updates and create a
+ * streamlined document.
+ * </p>
+ *
+ * <p>
+ * Note that this has nothing to do with text normalization. It just unifies
+ * the PDF documents that are fed into PDFbox for text extraction and page
+ * length determination.
+ * </p>
+ *
+ * @param input_pdf
+ * The input pdf to be normalized.
+ * @return Returns the normalized pdf.
+ * @throws IOException
+ * @throws DocumentException
+ * @throws PDFDocumentException
+ */
+ public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException
+ {
+ //iText
+ byte [] pdf_data = pdfDataSource.getAsByteArray();
+ PdfReader reader = new PdfReader(pdf_data);
+ PDFASUtils.checkReaderPermissions(reader);
+ //input_pdf.close();
+
+ // PERF: PDF Normalization needs byte array
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ // For some reason the Reader -> ImportPage -> Writer mechanism produces
+ // problems en mass.
+ // The text extractor may not be able to extract proper text from
+ // documents
+ // created with
+ // this method (although it works when a Table is appended)... very
+ // fragile.
+
+ Document document = new Document();
+
+ PdfWriter writer = PdfWriter.getInstance(document, baos);
+ document.open();
+
+ PdfContentByte cb = writer.getDirectContent();
+ for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++)
+ {
+ Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num);
+ document.setPageSize(new_size_withrot);
+ document.newPage();
+ PdfImportedPage page = writer.getImportedPage(reader, page_num);
+ // note that this will add an xobject form to the doc.
+ // the xobject form contains the content of the page.
+ cb.addTemplate(page, 0, 0);
+
+ // wprinz: debugging
+ // cb.beginText();
+ // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA,
+ // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14);
+ // cb.showText("page " + page_num);
+ // cb.endText();
+ // wprinz: end debugging
+ }
+
+ document.close();
+
+ // for (int i = 1; i <= reader.getNumberOfPages(); i++)
+ // {
+ // Rectangle rect = reader.getBoxSize(i, "bleed");
+ // logger_.debug("rect[" + i + "] = " + rect);
+ // }
+
+ baos.close();
+ byte[] normalizedPDF = baos.toByteArray();
+ return normalizedPDF;
+ }
+}