aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
diff options
context:
space:
mode:
author tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2013-01-09 15:41:29 +0000
committer tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2013-01-09 15:41:29 +0000
commit 535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree 0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
parent 1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
download pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip
pdf-as-lib maven project files moved to pdf-as-lib
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java')
-rw-r--r-- src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java 282
1 files changed, 0 insertions, 282 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
deleted file mode 100644
index 35a0768..0000000
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- *
- * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
- */
-package at.knowcenter.wag.egov.egiz.pdf;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.log4j.Logger;
-import org.pdfbox.pdfparser.PDFParser;
-import org.pdfbox.pdmodel.PDDocument;
-import org.pdfbox.util.PDFTextStripper;
-
-import at.gv.egiz.pdfas.exceptions.ErrorCode;
-import at.gv.egiz.pdfas.framework.input.PdfDataSource;
-import at.gv.egiz.pdfas.performance.PerformanceCounters;
-import at.gv.egiz.pdfas.utils.PDFASUtils;
-import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
-import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
-import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
-
-import com.lowagie.text.Document;
-import com.lowagie.text.DocumentException;
-import com.lowagie.text.Rectangle;
-import com.lowagie.text.pdf.PdfContentByte;
-import com.lowagie.text.pdf.PdfImportedPage;
-import com.lowagie.text.pdf.PdfReader;
-import com.lowagie.text.pdf.PdfWriter;
-
-/**
- * Contains helper functions for textual signatures.
- *
- * @author wprinz
- */
-public class TextualSignature
-{
- /**
- * The logger definition.
- */
- private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);
-
- /**
- * Extracts the document text from a given pdf.
- *
- * @param pdfDataSource
- * The pdf data source; its content is read via getAsByteArray().
- * @param encoding
- * The encoding handed to the PDFbox text stripper.
- * @return Returns the extracted document text.
- * @throws PDFDocumentException If the pdf cannot be read or text extraction fails.
- */
- public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException
- {
- PerformanceCounters.textExtractions.increment();
-
- try
- {
- int first_page_rotation = 0;
- // logger_.debug("====================================================");
- // logger_.debug("extractText:");
-
- // For text extraction, re-create the document with iText: every page of
- // the input is imported into a fresh document (see the loop below), just
- // like the temporary object created when being signed, but of course
- // without adding content.
-
-
- // byte[] bytes = normalizePDF(pdf_stream);
- // Read the input pdf with iText.
-
- byte [] pdf_data = pdfDataSource.getAsByteArray();
- PdfReader reader = new PdfReader(pdf_data);
- PDFASUtils.checkReaderPermissions(reader);
- //pdf_stream.close();
-
- // PERF: PDF normalization needs byte array - this is costly
- ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
-
- // NOTE: For some reason the Reader -> ImportPage -> Writer mechanism
- // produces problems en masse.
- // The text extractor may not be able to extract proper text from
- // documents created with this method (although it works when a Table is
- // appended)... very fragile.
- // Page size and rotation are taken from the reader for every page; the
- // rotation of the first page is remembered for the text stripper.
-
- Document document = new Document();
-
- PdfWriter writer = PdfWriter.getInstance(document, baos);
- document.open();
-
- PdfContentByte cb = writer.getDirectContent();
- for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++)
- {
- //Rectangle new_size = reader.getPageSize(page_num);
- //logger_.info("PageSize with no rotaion: Pagenr:"+page_num+" Size: "+new_size);
- //document.setPageSize(new_size);
- Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num);
- if (page_num == 1)
- {
- // Remember the rotation of the first page; it is passed to the stripper below.
- first_page_rotation = new_size_withrot.getRotation();
- //logger_.info("iText first_page_rotation="+new_size_withrot.getRotation());
- }
- //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot);
- //document.setPageSize(new_size);
- document.setPageSize(new_size_withrot);
- document.newPage();
-
- PdfImportedPage page = writer.getImportedPage(reader, page_num);
- // Note that this will add an xobject form to the doc.
- // The xobject form contains the content of the page.
- cb.addTemplate(page, 0, 0);
-
- // wprinz: debugging
- // cb.beginText();
- // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA,
- // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14);
- // cb.showText("page " + page_num);
- // cb.endText();
- // wprinz: end debugging
- }
-
- document.close();
-
- // for (int i = 1; i <= reader.getNumberOfPages(); i++)
- // {
- // Rectangle rect = reader.getBoxSize(i, "bleed");
- // logger_.debug("rect[" + i + "] = " + rect);
- // }
-
- baos.close();
- byte[] normalizedPDF = baos.toByteArray();
-
- ByteArrayInputStream bais = new ByteArrayInputStream(normalizedPDF);
- // Parse the re-created pdf with PDFBox.
- PDFParser parser = new PDFParser(bais);
- File temporary_dir = SettingsReader.getTemporaryDirectory();
- //logger_.info("temporary_dir="+temporary_dir.getAbsolutePath());
- parser.setTempDirectory(temporary_dir);
- parser.parse();
-
- PDDocument doc = parser.getPDDocument();
- //System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages());
-
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.setSortByPosition(false);
- stripper.setGetFirstPageRotationFromThis(true);
- stripper.setFirstPageRotation(first_page_rotation);
-
- // NOTE(review): if getText fails below, the exception is rethrown before
- // doc.close() is reached, so the PDDocument is not closed in that case.
- logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
- String text;
- try {
- text = stripper.getText(doc, encoding);
- } catch (Exception e) {
- throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e);
- }
- logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
-
- doc.close();
- //logger_.debug("TextualSignator extractTextTextual="+text);
- return text;
-
- }
- catch (IllegalArgumentException e)
- {
- throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
- }
- catch (IOException e)
- {
- throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
- }
- catch (DocumentException e)
- {
- throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
- }
- }
-
- /**
- * Normalizes a given binary PDF to a version PDFbox can handle correctly.
- *
- * <p>
- * PDFbox has serious problems with documents that use incremental updates or
- * XObject forms. Therefore use this to remove incremental updates and create a
- * streamlined document.
- * </p>
- *
- * <p>
- * Note that this has nothing to do with text normalization. It just unifies
- * the PDF documents that are fed into PDFbox for text extraction and page
- * length determination.
- * </p>
- *
- * @param pdfDataSource
- * The input pdf to be normalized; read via getAsByteArray().
- * @return Returns the normalized pdf.
- * @throws IOException Forwarded exception.
- * @throws DocumentException Forwarded exception.
- * @throws PDFDocumentException Forwarded exception.
- */
- public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException
- {
- // Read the input pdf with iText.
- byte [] pdf_data = pdfDataSource.getAsByteArray();
- PdfReader reader = new PdfReader(pdf_data);
- PDFASUtils.checkReaderPermissions(reader);
- //input_pdf.close();
-
- // PERF: PDF normalization needs byte array
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- // NOTE: For some reason the Reader -> ImportPage -> Writer mechanism
- // produces problems en masse.
- // The text extractor may not be able to extract proper text from
- // documents created with this method (although it works when a Table is
- // appended)... very fragile.
- // Each page is imported below as a template into a new document that
- // uses the page size (with rotation) reported by the reader.
-
- Document document = new Document();
-
- PdfWriter writer = PdfWriter.getInstance(document, baos);
- document.open();
-
- PdfContentByte cb = writer.getDirectContent();
- for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++)
- {
- Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num);
- document.setPageSize(new_size_withrot);
- document.newPage();
- PdfImportedPage page = writer.getImportedPage(reader, page_num);
- // Note that this will add an xobject form to the doc.
- // The xobject form contains the content of the page.
- cb.addTemplate(page, 0, 0);
-
- // wprinz: debugging
- // cb.beginText();
- // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA,
- // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14);
- // cb.showText("page " + page_num);
- // cb.endText();
- // wprinz: end debugging
- }
-
- document.close();
-
- // for (int i = 1; i <= reader.getNumberOfPages(); i++)
- // {
- // Rectangle rect = reader.getBoxSize(i, "bleed");
- // logger_.debug("rect[" + i + "] = " + rect);
- // }
-
- baos.close();
- byte[] normalizedPDF = baos.toByteArray();
- return normalizedPDF;
- }
-}