From 0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1 Mon Sep 17 00:00:00 2001 From: ferbas Date: Thu, 26 Nov 2009 09:53:03 +0000 Subject: improved exception handling added annotation extraction (deactivated, needs pdfbox 0.8.0) git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@464 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 97 +++++++++++++++------- 1 file changed, 69 insertions(+), 28 deletions(-) (limited to 'src/main') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 7f567c4..d7fbe64 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -32,6 +32,7 @@ import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage; import org.pdfbox.pdmodel.PDResources; import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; +import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.pdfbox.util.PDFTextStripper; import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo; @@ -91,6 +92,11 @@ public class TextualSignature byte [] pdf_data = pdfDataSource.getAsByteArray(); PdfReader reader = new PdfReader(pdf_data); + if (!reader.isOpenedWithFullPermissions()) { + // cannot perform modification and extraction + throw new TextExtractionException("Document is protected"); + + } //pdf_stream.close(); // PERF: PDF normalization needs byte array - this is costy @@ -179,6 +185,10 @@ public class TextualSignature return text; } + catch (IllegalArgumentException e) + { + throw new TextExtractionException(e); + } catch (IOException e) { throw new TextExtractionException(e); @@ -238,34 +248,8 @@ public class TextualSignature while (iter.hasNext()) { pageNr++; PDPage page = (PDPage) iter.next(); - PDResources resources = page.getResources(); - - Map images; - try { - images = resources.getImages(); - } catch (IOException e) { - logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e); - return objectInfos; - } - if (images != null) { - Iterator imageIter = images.keySet().iterator(); - while (imageIter.hasNext()) { - NonTextObjectInfo objInfo = new NonTextObjectInfo(); - String key = (String) imageIter.next(); - PDXObjectImage image = (PDXObjectImage) images.get(key); - - objInfo.setHeight(image.getHeight()); - objInfo.setWidth(image.getWidth()); - objInfo.setName(key + image.getSuffix()); - objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE); - objInfo.setPageNr(pageNr); - objectInfos.add(objInfo); - - if (logger_.isDebugEnabled()) { - logger_.debug("Found non text object: " + objInfo.toString()); - } - } - } + doExtractFromResources(objectInfos, pageNr, page); + //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed } if (logger_.isDebugEnabled()) { logger_.debug("extracted non textual objects count: " + objectInfos.size()); @@ -275,6 +259,63 @@ public class TextualSignature } + private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) { + List annotations; + try { + annotations = page.getAnnotations(); + } catch (IOException e) { + logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e); + return; + } + for (Iterator it = annotations.iterator(); it.hasNext();) { + try { + PDAnnotation anno = (PDAnnotation) it.next(); + NonTextObjectInfo objInfo = new NonTextObjectInfo(); + objInfo.setName(anno.getDictionary().getString( "NM" )); + objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION); + objInfo.setSubType(anno.getDictionary().getString("Subtype") + + "/" + anno.getDictionary().getString("Subj")); + objInfo.setPageNr(pageNr); + objInfo.setHeight(anno.getRectangle().getHeight()); + objInfo.setWidth(anno.getRectangle().getWidth()); + } catch (Exception ex) { + logger_.info("error reading non text object info key " + ex); + } + } + } + + private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) { + PDResources resources = page.getResources(); + + Map images; + try { + images = resources.getImages(); + } catch (IOException e) { + logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e); + return; + } + if (images != null) { + Iterator imageIter = images.keySet().iterator(); + while (imageIter.hasNext()) { + NonTextObjectInfo objInfo = new NonTextObjectInfo(); + String key = (String) imageIter.next(); + PDXObjectImage image = (PDXObjectImage) images.get(key); + System.err.println(image); + + objInfo.setHeight(image.getHeight()); + objInfo.setWidth(image.getWidth()); + objInfo.setName(key + image.getSuffix()); + objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE); + objInfo.setPageNr(pageNr); + objectInfos.add(objInfo); + + if (logger_.isDebugEnabled()) { + logger_.debug("Found non text object: " + objInfo.toString()); + } + } + } + } + /** * Normalizes a given binary PDF to a version PDFbox can handle correctly. -- cgit v1.2.3