From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 282 +++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java new file mode 100644 index 0000000..35a0768 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -0,0 +1,282 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.pdf; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; + +import org.apache.log4j.Logger; +import org.pdfbox.pdfparser.PDFParser; +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.util.PDFTextStripper; + +import at.gv.egiz.pdfas.exceptions.ErrorCode; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.performance.PerformanceCounters; +import at.gv.egiz.pdfas.utils.PDFASUtils; +import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; +import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; +import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; + +import com.lowagie.text.Document; +import com.lowagie.text.DocumentException; +import com.lowagie.text.Rectangle; +import com.lowagie.text.pdf.PdfContentByte; +import com.lowagie.text.pdf.PdfImportedPage; +import com.lowagie.text.pdf.PdfReader; +import com.lowagie.text.pdf.PdfWriter; + +/** + * Contains helper function for textual signatures. + * + * @author wprinz + */ +public class TextualSignature +{ + /** + * The logger definition. + */ + private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class); + + /** + * Extracts the document text from a given pdf. + * + * @param pdf_stream + * The pdf_input stream. + * @return Returns the extracted document text. + * @throws PDFDocumentException + * @throws TextExtractionException + * Forwarded exception. + */ + public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException + { + PerformanceCounters.textExtractions.increment(); + + try + { + int first_page_rotation = 0; + // logger_.debug("===================================================="); + // logger_.debug("extractText:"); + + // For text extraction, create a temporary object with iText just as the + // one + // created + // when being signed, but of course without adding content. + + + // byte[] bytes = normalizePDF(pdf_stream); + //iText + + byte [] pdf_data = pdfDataSource.getAsByteArray(); + PdfReader reader = new PdfReader(pdf_data); + PDFASUtils.checkReaderPermissions(reader); + //pdf_stream.close(); + + // PERF: PDF normalization needs byte array - this is costy + ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); + + // For some reason the Reader -> ImportPage -> Writer mechanism produces + // problems en mass. + // The text extractor may not be able to extract proper text from + // documents + // created with + // this method (although it works when a Table is appended)... very + // fragile. + + Document document = new Document(); + + PdfWriter writer = PdfWriter.getInstance(document, baos); + document.open(); + + PdfContentByte cb = writer.getDirectContent(); + for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++) + { + //Rectangle new_size = reader.getPageSize(page_num); + //logger_.info("PageSize with no rotaion: Pagenr:"+page_num+" Size: "+new_size); + //document.setPageSize(new_size); + Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num); + if (page_num == 1) + { + //setFirstPageRotation(new_size_withrot.getRotation()); + first_page_rotation = new_size_withrot.getRotation(); + //logger_.info("iText first_page_rotation="+new_size_withrot.getRotation()); + } + //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot); + //document.setPageSize(new_size); + document.setPageSize(new_size_withrot); + document.newPage(); + + PdfImportedPage page = writer.getImportedPage(reader, page_num); + // note that this will add an xobject form to the doc. + // the xobject form contains the content of the page. + cb.addTemplate(page, 0, 0); + + // wprinz: debugging + // cb.beginText(); + // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA, + // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14); + // cb.showText("page " + page_num); + // cb.endText(); + // wprinz: end debugging + } + + document.close(); + + // for (int i = 1; i <= reader.getNumberOfPages(); i++) + // { + // Rectangle rect = reader.getBoxSize(i, "bleed"); + // logger_.debug("rect[" + i + "] = " + rect); + // } + + baos.close(); + byte[] normalizedPDF = baos.toByteArray(); + + ByteArrayInputStream bais = new ByteArrayInputStream(normalizedPDF); + //PDFBox-parser + PDFParser parser = new PDFParser(bais); + File temporary_dir = SettingsReader.getTemporaryDirectory(); + //logger_.info("temporary_dir="+temporary_dir.getAbsolutePath()); + parser.setTempDirectory(temporary_dir); + parser.parse(); + + PDDocument doc = parser.getPDDocument(); + //System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages()); + + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(false); + stripper.setGetFirstPageRotationFromThis(true); + stripper.setFirstPageRotation(first_page_rotation); + + // stripper.setStartPage(4); + // stripper.setEndPage(4); + logger_.debug("TextualSignator extractTextTextual: Begin stripping text"); + String text; + try { + text = stripper.getText(doc, encoding); + } catch (Exception e) { + throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e); + } + logger_.debug("TextualSignator extractTextTextual: Stripping text ended"); + + doc.close(); + //logger_.debug("TextualSignator extractTextTextual="+text); + return text; + + } + catch (IllegalArgumentException e) + { + throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); + } + catch (IOException e) + { + throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); + } + catch (DocumentException e) + { + throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); + } + } + + /** + * Normalizes a given binary PDF to a version PDFbox can handle correctly. + * + *

+ * PDFbox has serious problems with documents that use incremental updates or + * XObject forms. Therefor use this to remove incremental updates and create a + * streamlined document. + *

+ * + *

+ * Note that this has nothing to do with text normalization. It just unifies + * the PDF documents that are fed into PDFbox for text extraction and page + * length determination. + *

+ * + * @param input_pdf + * The input pdf to be normalized. + * @return Returns the normalized pdf. + * @throws IOException + * @throws DocumentException + * @throws PDFDocumentException + */ + public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException + { + //iText + byte [] pdf_data = pdfDataSource.getAsByteArray(); + PdfReader reader = new PdfReader(pdf_data); + PDFASUtils.checkReaderPermissions(reader); + //input_pdf.close(); + + // PERF: PDF Normalization needs byte array + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // For some reason the Reader -> ImportPage -> Writer mechanism produces + // problems en mass. + // The text extractor may not be able to extract proper text from + // documents + // created with + // this method (although it works when a Table is appended)... very + // fragile. + + Document document = new Document(); + + PdfWriter writer = PdfWriter.getInstance(document, baos); + document.open(); + + PdfContentByte cb = writer.getDirectContent(); + for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++) + { + Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num); + document.setPageSize(new_size_withrot); + document.newPage(); + PdfImportedPage page = writer.getImportedPage(reader, page_num); + // note that this will add an xobject form to the doc. + // the xobject form contains the content of the page. + cb.addTemplate(page, 0, 0); + + // wprinz: debugging + // cb.beginText(); + // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA, + // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14); + // cb.showText("page " + page_num); + // cb.endText(); + // wprinz: end debugging + } + + document.close(); + + // for (int i = 1; i <= reader.getNumberOfPages(); i++) + // { + // Rectangle rect = reader.getBoxSize(i, "bleed"); + // logger_.debug("rect[" + i + "] = " + rect); + // } + + baos.close(); + byte[] normalizedPDF = baos.toByteArray(); + return normalizedPDF; + } +} -- cgit v1.2.3