/** * Copyright 2006 by Know-Center, Graz, Austria * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a * joint initiative of the Federal Chancellery Austria and Graz University of * Technology. * * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by * the European Commission - subsequent versions of the EUPL (the "Licence"); * You may not use this work except in compliance with the Licence. * You may obtain a copy of the Licence at: * http://www.osor.eu/eupl/ * * Unless required by applicable law or agreed to in writing, software * distributed under the Licence is distributed on an "AS IS" basis, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Licence for the specific language governing permissions and * limitations under the Licence. * * This product combines work with different licenses. See the "NOTICE" text * file for details on the various modules and licenses. * The "NOTICE" text file is part of the distribution. Any derivative works * that you distribute must include a readable copy of the "NOTICE" text file. * * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $ */ package at.knowcenter.wag.egov.egiz.pdf; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import org.apache.log4j.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; import at.gv.egiz.pdfas.exceptions.ErrorCode; import at.gv.egiz.pdfas.framework.input.PdfDataSource; import at.gv.egiz.pdfas.performance.PerformanceCounters; import at.gv.egiz.pdfas.utils.PDFASUtils; import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; import com.lowagie.text.Document; import com.lowagie.text.DocumentException; import com.lowagie.text.Rectangle; import com.lowagie.text.pdf.PdfContentByte; import com.lowagie.text.pdf.PdfImportedPage; import com.lowagie.text.pdf.PdfReader; import com.lowagie.text.pdf.PdfWriter; /** * Contains helper function for textual signatures. * * @author wprinz */ public class TextualSignature { /** * The logger definition. */ private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class); /** * Extracts the document text from a given pdf. * * @param pdf_stream * The pdf_input stream. * @return Returns the extracted document text. * @throws PDFDocumentException * @throws TextExtractionException * Forwarded exception. */ public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException { PerformanceCounters.textExtractions.increment(); try { int first_page_rotation = 0; // logger_.debug("===================================================="); // logger_.debug("extractText:"); // For text extraction, create a temporary object with iText just as the // one // created // when being signed, but of course without adding content. // byte[] bytes = normalizePDF(pdf_stream); //iText byte [] pdf_data = pdfDataSource.getAsByteArray(); PdfReader reader = new PdfReader(pdf_data); PDFASUtils.checkReaderPermissions(reader); //pdf_stream.close(); // PERF: PDF normalization needs byte array - this is costy ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); // For some reason the Reader -> ImportPage -> Writer mechanism produces // problems en mass. // The text extractor may not be able to extract proper text from // documents // created with // this method (although it works when a Table is appended)... very // fragile. Document document = new Document(); PdfWriter writer = PdfWriter.getInstance(document, baos); document.open(); PdfContentByte cb = writer.getDirectContent(); for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++) { //Rectangle new_size = reader.getPageSize(page_num); //logger_.info("PageSize with no rotaion: Pagenr:"+page_num+" Size: "+new_size); //document.setPageSize(new_size); Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num); if (page_num == 1) { //setFirstPageRotation(new_size_withrot.getRotation()); first_page_rotation = new_size_withrot.getRotation(); //logger_.info("iText first_page_rotation="+new_size_withrot.getRotation()); } //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot); //document.setPageSize(new_size); document.setPageSize(new_size_withrot); document.newPage(); PdfImportedPage page = writer.getImportedPage(reader, page_num); // note that this will add an xobject form to the doc. // the xobject form contains the content of the page. cb.addTemplate(page, 0, 0); // wprinz: debugging // cb.beginText(); // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA, // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14); // cb.showText("page " + page_num); // cb.endText(); // wprinz: end debugging } document.close(); // for (int i = 1; i <= reader.getNumberOfPages(); i++) // { // Rectangle rect = reader.getBoxSize(i, "bleed"); // logger_.debug("rect[" + i + "] = " + rect); // } baos.close(); byte[] normalizedPDF = baos.toByteArray(); ByteArrayInputStream bais = new ByteArrayInputStream(normalizedPDF); //PDFBox-parser PDFParser parser = new PDFParser(bais); File temporary_dir = SettingsReader.getTemporaryDirectory(); //logger_.info("temporary_dir="+temporary_dir.getAbsolutePath()); parser.setTempDirectory(temporary_dir); parser.parse(); PDDocument doc = parser.getPDDocument(); //System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages()); PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(false); stripper.setGetFirstPageRotationFromThis(true); stripper.setFirstPageRotation(first_page_rotation); // stripper.setStartPage(4); // stripper.setEndPage(4); logger_.debug("TextualSignator extractTextTextual: Begin stripping text"); String text; try { text = stripper.getText(doc, encoding); } catch (Exception e) { throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e); } logger_.debug("TextualSignator extractTextTextual: Stripping text ended"); doc.close(); //logger_.debug("TextualSignator extractTextTextual="+text); return text; } catch (IllegalArgumentException e) { throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); } catch (IOException e) { throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); } catch (DocumentException e) { throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); } } /** * Normalizes a given binary PDF to a version PDFbox can handle correctly. * *

* PDFbox has serious problems with documents that use incremental updates or * XObject forms. Therefor use this to remove incremental updates and create a * streamlined document. *

* *

* Note that this has nothing to do with text normalization. It just unifies * the PDF documents that are fed into PDFbox for text extraction and page * length determination. *

* * @param input_pdf * The input pdf to be normalized. * @return Returns the normalized pdf. * @throws IOException * @throws DocumentException * @throws PDFDocumentException */ public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException { //iText byte [] pdf_data = pdfDataSource.getAsByteArray(); PdfReader reader = new PdfReader(pdf_data); PDFASUtils.checkReaderPermissions(reader); //input_pdf.close(); // PERF: PDF Normalization needs byte array ByteArrayOutputStream baos = new ByteArrayOutputStream(); // For some reason the Reader -> ImportPage -> Writer mechanism produces // problems en mass. // The text extractor may not be able to extract proper text from // documents // created with // this method (although it works when a Table is appended)... very // fragile. Document document = new Document(); PdfWriter writer = PdfWriter.getInstance(document, baos); document.open(); PdfContentByte cb = writer.getDirectContent(); for (int page_num = 1; page_num <= reader.getNumberOfPages(); page_num++) { Rectangle new_size_withrot =reader.getPageSizeWithRotation(page_num); document.setPageSize(new_size_withrot); document.newPage(); PdfImportedPage page = writer.getImportedPage(reader, page_num); // note that this will add an xobject form to the doc. // the xobject form contains the content of the page. cb.addTemplate(page, 0, 0); // wprinz: debugging // cb.beginText(); // cb.setFontAndSize(BaseFont.createFont(BaseFont.HELVETICA, // BaseFont.CP1252, BaseFont.NOT_EMBEDDED), 14); // cb.showText("page " + page_num); // cb.endText(); // wprinz: end debugging } document.close(); // for (int i = 1; i <= reader.getNumberOfPages(); i++) // { // Rectangle rect = reader.getBoxSize(i, "bleed"); // logger_.debug("rect[" + i + "] = " + rect); // } baos.close(); byte[] normalizedPDF = baos.toByteArray(); return normalizedPDF; } }