/**
 * Copyright (c) 2006 by Know-Center, Graz, Austria
 *
 * This software is the confidential and proprietary information of Know-Center,
 * Graz, Austria. You shall not disclose such Confidential Information and shall
 * use it only in accordance with the terms of the license agreement you entered
 * into with Know-Center.
 *
 * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
 * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
 * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
 * DERIVATIVES.
 *
 * $Id: TextualSignature.java,v 1.4 2006/10/31 08:12:45 wprinz Exp $
 */
package at.knowcenter.wag.egov.egiz.pdf;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;
import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.pdfbox.util.PDFTextStripper;

import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
import at.gv.egiz.pdfas.framework.input.PdfDataSource;
import at.gv.egiz.pdfas.performance.PerformanceCounters;
import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;

import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;

/**
 * Contains helper functions for textual signatures.
 *
 * @author wprinz
 */
public class TextualSignature {

  /**
   * The logger definition.
   */
  private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);

  /**
   * Extracts the document text from a given pdf.
   *
   * <p>
   * The pdf is first rewritten with iText (every page is imported into a fresh
   * document, just like it is done when signing, but without adding content)
   * and the rewritten document is then fed into the PDFBox text stripper.
   * </p>
   *
   * @param pdfDataSource
   *          The pdf to extract the text from.
   * @return Returns the extracted document text.
   * @throws TextExtractionException
   *           Thrown if the document is not opened with full permissions
   *           ("Document is protected"), or wrapping any
   *           {@link IllegalArgumentException}, {@link IOException} or
   *           {@link DocumentException} raised while processing the pdf.
   */
  public static String extractTextTextual(PdfDataSource pdfDataSource) throws TextExtractionException {
    PerformanceCounters.textExtractions.increment();

    try {
      // iText
      byte[] pdf_data = pdfDataSource.getAsByteArray();
      PdfReader reader = new PdfReader(pdf_data);
      if (!reader.isOpenedWithFullPermissions()) {
        // cannot perform modification and extraction
        throw new TextExtractionException("Document is protected");
      }

      // The rotation of the first page is handed to the text stripper below.
      int first_page_rotation = 0;
      if (reader.getNumberOfPages() > 0) {
        first_page_rotation = reader.getPageSizeWithRotation(1).getRotation();
      }

      // PERF: PDF normalization needs a byte array - this is costly
      byte[] normalizedPDF = importAllPages(reader);

      // PDFBox parser
      PDFParser parser = new PDFParser(new ByteArrayInputStream(normalizedPDF));
      File temporary_dir = SettingsReader.getTemporaryDirectory();
      parser.setTempDirectory(temporary_dir);
      parser.parse();

      PDDocument doc = parser.getPDDocument();
      try {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(false);
        stripper.setGetFirstPageRotationFromThis(true);
        stripper.setFirstPageRotation(first_page_rotation);

        logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
        String text = stripper.getText(doc);
        logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
        return text;
      } finally {
        // Close the document on every path; previously it was left open
        // whenever the stripper failed.
        doc.close();
      }
    } catch (IllegalArgumentException e) {
      throw new TextExtractionException(e);
    } catch (IOException e) {
      throw new TextExtractionException(e);
    } catch (DocumentException e) {
      throw new TextExtractionException(e);
    }
  }

  /**
   * Extracts non textual data from a pdf.
   *
   * <p>
   * If the document cannot be loaded, the error is logged and an empty list is
   * returned instead of propagating the exception.
   * </p>
   *
   * @param pdfDataSource
   *          The pdf to be analyzed.
   *
   * @see org.pdfbox.ExtractImages
   *
   * @return List of {@link NonTextObjectInfo}
   */
  public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
    PDDocument doc = null;
    try {
      doc = PDDocument.load(pdfDataSource.createInputStream());
      return extractNonTextInfo(doc);
    } catch (IOException e) {
      logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
      return new ArrayList();
    } finally {
      // Single place where the document is closed (it used to be closed twice
      // on the success path).
      if (doc != null) {
        try {
          doc.close();
        } catch (IOException e) {
          logger_.error("error closing pddocument", e);
        }
      }
    }
  }

  /**
   * Extracts non textual data from a pdf.
   *
   * <p>
   * Walks all pages of the document catalog (page numbers start at 1) and
   * collects the images found in each page's resources. The document is not
   * closed by this method.
   * </p>
   *
   * @see org.pdfbox.ExtractImages
   *
   * @param document
   *          The already loaded pdf document.
   * @return List of {@link NonTextObjectInfo}
   */
  public static List extractNonTextInfo(PDDocument document) {
    // extraction does not work with the normalized pdf from extractTextTextual
    logger_.debug("going to extract non text objects");

    List objectInfos = new ArrayList();
    List pages = document.getDocumentCatalog().getAllPages();
    int pageNr = 0;
    for (Iterator iter = pages.iterator(); iter.hasNext();) {
      pageNr++;
      PDPage page = (PDPage) iter.next();
      doExtractFromResources(objectInfos, pageNr, page);
      // doExtractAnnotations(objectInfos, pageNr, page);
      // does not work with pdf-box 0.7.2 -> 0.8.0 needed
    }

    if (logger_.isDebugEnabled()) {
      logger_.debug("extracted non textual objects count: " + objectInfos.size());
    }
    logger_.debug("finished extracting non text objects");
    return objectInfos;
  }

  /**
   * Collects one {@link NonTextObjectInfo} per annotation of the given page.
   *
   * <p>
   * Currently not called, see the note in
   * {@link #extractNonTextInfo(PDDocument)}. An annotation that cannot be read
   * is logged and skipped.
   * </p>
   *
   * @param objectInfos
   *          The list the found infos are appended to.
   * @param pageNr
   *          The number of the page, stored in each info.
   * @param page
   *          The page to read the annotations from.
   */
  private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
    List annotations;
    try {
      annotations = page.getAnnotations();
    } catch (IOException e) {
      logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
      return;
    }
    if (annotations == null) {
      return;
    }

    for (Iterator it = annotations.iterator(); it.hasNext();) {
      try {
        PDAnnotation anno = (PDAnnotation) it.next();
        NonTextObjectInfo objInfo = new NonTextObjectInfo();
        objInfo.setName(anno.getDictionary().getString("NM"));
        objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
        objInfo.setSubType(anno.getDictionary().getString("Subtype") + "/" + anno.getDictionary().getString("Subj"));
        objInfo.setPageNr(pageNr);
        objInfo.setHeight(anno.getRectangle().getHeight());
        objInfo.setWidth(anno.getRectangle().getWidth());
        // The info was previously built but never stored, so the method had
        // no visible effect.
        objectInfos.add(objInfo);
      } catch (Exception ex) {
        logger_.info("error reading non text object info key " + ex);
      }
    }
  }

  /**
   * Collects one {@link NonTextObjectInfo} per image found in the resources of
   * the given page.
   *
   * <p>
   * If the page reports no resources or no images, or the images cannot be
   * read, nothing is added for this page.
   * </p>
   *
   * @param objectInfos
   *          The list the found infos are appended to.
   * @param pageNr
   *          The number of the page, stored in each info.
   * @param page
   *          The page to read the image resources from.
   */
  private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
    PDResources resources = page.getResources();
    if (resources == null) {
      // nothing to inspect on this page
      return;
    }

    Map images;
    try {
      images = resources.getImages();
    } catch (IOException e) {
      logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
      return;
    }
    if (images == null) {
      return;
    }

    for (Iterator imageIter = images.keySet().iterator(); imageIter.hasNext();) {
      String key = (String) imageIter.next();
      PDXObjectImage image = (PDXObjectImage) images.get(key);

      NonTextObjectInfo objInfo = new NonTextObjectInfo();
      objInfo.setHeight(image.getHeight());
      objInfo.setWidth(image.getWidth());
      objInfo.setName(key + image.getSuffix());
      objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
      objInfo.setPageNr(pageNr);
      objectInfos.add(objInfo);

      if (logger_.isDebugEnabled()) {
        logger_.debug("Found non text object: " + objInfo.toString());
      }
    }
  }

  /**
   * Normalizes a given binary PDF to a version PDFbox can handle correctly.
   *
   * <p>
   * PDFbox has serious problems with documents that use incremental updates or
   * XObject forms. Therefore use this to remove incremental updates and create
   * a streamlined document.
   * </p>
   *
   * <p>
   * Note that this has nothing to do with text normalization. It just unifies
   * the PDF documents that are fed into PDFbox for text extraction and page
   * length determination.
   * </p>
   *
   * @param pdfDataSource
   *          The input pdf to be normalized.
   * @return Returns the normalized pdf.
   * @throws IOException
   *           Forwarded exception.
   * @throws DocumentException
   *           Forwarded exception.
   */
  public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException {
    // iText
    byte[] pdf_data = pdfDataSource.getAsByteArray();
    PdfReader reader = new PdfReader(pdf_data);
    // PERF: PDF normalization needs a byte array
    return importAllPages(reader);
  }

  /**
   * Writes a new pdf that contains every page of the given reader, imported as
   * a template into a page of the same size and rotation.
   *
   * @param reader
   *          The reader of the pdf to be rewritten.
   * @return Returns the bytes of the rewritten pdf.
   * @throws IOException
   *           Forwarded exception.
   * @throws DocumentException
   *           Forwarded exception.
   */
  private static byte[] importAllPages(PdfReader reader) throws IOException, DocumentException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);

    // For some reason the Reader -> ImportPage -> Writer mechanism produces
    // problems en masse. The text extractor may not be able to extract proper
    // text from documents created with this method (although it works when a
    // Table is appended)... very fragile.
    Document document = new Document();
    PdfWriter writer = PdfWriter.getInstance(document, baos);
    document.open();

    PdfContentByte cb = writer.getDirectContent();
    int page_count = reader.getNumberOfPages();
    for (int page_num = 1; page_num <= page_count; page_num++) {
      Rectangle new_size_withrot = reader.getPageSizeWithRotation(page_num);
      document.setPageSize(new_size_withrot);
      document.newPage();

      PdfImportedPage page = writer.getImportedPage(reader, page_num);
      // note that this will add an xobject form to the doc.
      // the xobject form contains the content of the page.
      cb.addTemplate(page, 0, 0);
    }

    document.close();
    baos.close();

    return baos.toByteArray();
  }
}