From 3d1a2a395a17261e6c29fb26fd424e4e92ea218a Mon Sep 17 00:00:00 2001 From: ferbas Date: Thu, 19 Nov 2009 14:24:11 +0000 Subject: added non text oject info git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@445 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 107 +++++++++++++++++++-- 1 file changed, 100 insertions(+), 7 deletions(-) diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 04b96fc..7f567c4 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -21,20 +21,25 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.InputStream; - -import at.gv.egiz.pdfas.performance.PerformanceCounters; -import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; import org.apache.log4j.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.pdmodel.PDPage; +import org.pdfbox.pdmodel.PDResources; +import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; import org.pdfbox.util.PDFTextStripper; +import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo; +import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.performance.PerformanceCounters; import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; -import at.knowcenter.wag.egov.egiz.exceptions.PresentableException; import com.lowagie.text.Document; import com.lowagie.text.DocumentException; @@ -62,7 +67,7 @@ public class TextualSignature * @param 
pdf_stream * The pdf_input stream. * @return Returns the extracted document text. - * @throws PresentableException + * @throws TextExtractionException * Forwarded exception. */ public static String extractTextTextual(PdfDataSource pdfDataSource) throws TextExtractionException @@ -168,6 +173,7 @@ public class TextualSignature logger_.debug("TextualSignator extractTextTextual: Begin stripping text"); String text = stripper.getText(doc); logger_.debug("TextualSignator extractTextTextual: Stripping text ended"); + doc.close(); //logger_.debug("TextualSignator extractTextTextual="+text); return text; @@ -182,6 +188,93 @@ public class TextualSignature throw new TextExtractionException(e); } }
+
+  /**
+   * Extracts information about the non-textual objects (images) of a PDF by
+   * loading it and delegating to {@link #extractNonTextInfo(PDDocument)}.
+   * An I/O error is logged and yields an empty list instead of an exception.
+   *
+   * @param pdfDataSource
+   *          The source the PDF input stream is created from.
+   * @return List of {@link NonTextObjectInfo}, empty if loading failed.
+   *
+   * @see org.pdfbox.ExtractImages
+   */
+  public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
+    PDDocument doc = null;
+    try {
+      doc = PDDocument.load(pdfDataSource.createInputStream());
+      // No close() here: the finally block closes the document exactly once.
+      return extractNonTextInfo(doc);
+    } catch (IOException e) {
+      logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
+      return new ArrayList();
+    } finally {
+      if (doc != null) {
+        try {
+          doc.close();
+        } catch (IOException e) {
+          logger_.error("error closing pddocument", e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Extract non textual data from pdf.
+   *
+   * Collects one {@link NonTextObjectInfo} (size, name, type, page number) per
+   * image found in the resources of each page. Pages without resources are
+   * skipped; an I/O error ends the scan and returns what was collected so far.
+   *
+   * @param document The opened PDF document; it is not closed by this method.
+   * @return List of {@link NonTextObjectInfo}
+   * @see org.pdfbox.ExtractImages
+   */
+  public static List extractNonTextInfo(PDDocument document) {
+    // extraction does not work with the normalized pdf from extractTextTextual
+    logger_.debug("going to extract non text objects");
+    List objectInfos = new ArrayList();
+    List pages = document.getDocumentCatalog().getAllPages();
+    int pageNr = 0;
+    for (Iterator iter = pages.iterator(); iter.hasNext();) {
+      pageNr++;
+      PDPage page = (PDPage) iter.next();
+      PDResources resources = page.getResources();
+      if (resources == null) {
+        continue; // defensive: avoids an NPE if the page exposes no resources of its own
+      }
+      Map images;
+      try {
+        images = resources.getImages();
+      } catch (IOException e) {
+        logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
+        return objectInfos;
+      }
+      if (images != null) {
+        for (Iterator imageIter = images.keySet().iterator(); imageIter.hasNext();) {
+          NonTextObjectInfo objInfo = new NonTextObjectInfo();
+          String key = (String) imageIter.next();
+          PDXObjectImage image = (PDXObjectImage) images.get(key);
+          objInfo.setHeight(image.getHeight());
+          objInfo.setWidth(image.getWidth());
+          objInfo.setName(key + image.getSuffix());
+          objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
+          objInfo.setPageNr(pageNr);
+          objectInfos.add(objInfo);
+          if (logger_.isDebugEnabled()) {
+            logger_.debug("Found non text object: " + objInfo.toString());
+          }
+        }
+      }
+    }
+    if (logger_.isDebugEnabled()) {
+      logger_.debug("extracted non textual objects count: " + objectInfos.size());
+    }
+    logger_.debug("finished extracting non text objects");
+    return objectInfos;
+  }
+
+ /** * Normalizes a given binary PDF to a version PDFbox can handle correctly. -- cgit v1.2.3