diff options
author | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-11-26 09:53:03 +0000 |
---|---|---|
committer | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-11-26 09:53:03 +0000 |
commit | 0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1 (patch) | |
tree | 187fc0d3bbb6e140920c09881ac8331738cc25c0 | |
parent | f6882cf38725a930b3e60db5289da9f35f9e7477 (diff) | |
download | pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.tar.gz pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.tar.bz2 pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.zip |
improved exception handling
added annotation extraction (deactivated, needs pdfbox 0.8.0)
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@464 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
-rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java | 97 |
1 files changed, 69 insertions, 28 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 7f567c4..d7fbe64 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -32,6 +32,7 @@ import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;
import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
+import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.pdfbox.util.PDFTextStripper;
import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
@@ -91,6 +92,11 @@ public class TextualSignature byte [] pdf_data = pdfDataSource.getAsByteArray();
PdfReader reader = new PdfReader(pdf_data);
+ if (!reader.isOpenedWithFullPermissions()) {
+ // cannot perform modification and extraction
+ throw new TextExtractionException("Document is protected");
+
+ }
//pdf_stream.close();
// PERF: PDF normalization needs byte array - this is costy
@@ -179,6 +185,10 @@ public class TextualSignature return text;
}
+ catch (IllegalArgumentException e)
+ {
+ throw new TextExtractionException(e);
+ }
catch (IOException e)
{
throw new TextExtractionException(e);
@@ -238,34 +248,8 @@ public class TextualSignature while (iter.hasNext()) {
pageNr++;
PDPage page = (PDPage) iter.next();
- PDResources resources = page.getResources();
-
- Map images;
- try {
- images = resources.getImages();
- } catch (IOException e) {
- logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
- return objectInfos;
- }
- if (images != null) {
- Iterator imageIter = images.keySet().iterator();
- while (imageIter.hasNext()) {
- NonTextObjectInfo objInfo = new NonTextObjectInfo();
- String key = (String) imageIter.next();
- PDXObjectImage image = (PDXObjectImage) images.get(key);
-
- objInfo.setHeight(image.getHeight());
- objInfo.setWidth(image.getWidth());
- objInfo.setName(key + image.getSuffix());
- objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
- objInfo.setPageNr(pageNr);
- objectInfos.add(objInfo);
-
- if (logger_.isDebugEnabled()) {
- logger_.debug("Found non text object: " + objInfo.toString());
- }
- }
- }
+ doExtractFromResources(objectInfos, pageNr, page);
+ //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
}
if (logger_.isDebugEnabled()) {
logger_.debug("extracted non textual objects count: " + objectInfos.size());
@@ -275,6 +259,63 @@ public class TextualSignature }
+ /**
+  * Reads the annotations of a single page and appends one
+  * {@link NonTextObjectInfo} of type {@code TYPE_ANNOTATION} per annotation
+  * to the given list.
+  *
+  * The only call site visible in this change is commented out with the note
+  * that pdfbox 0.8.0 is required.
+  *
+  * @param objectInfos list that receives the created entries
+  * @param pageNr page number stored on every created entry
+  * @param page page whose annotations are read
+  */
+ private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
+   List annotations;
+   try {
+     annotations = page.getAnnotations();
+   } catch (IOException e) {
+     logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
+     return;
+   }
+   for (Iterator it = annotations.iterator(); it.hasNext();) {
+     try {
+       PDAnnotation anno = (PDAnnotation) it.next();
+       NonTextObjectInfo objInfo = new NonTextObjectInfo();
+       objInfo.setName(anno.getDictionary().getString( "NM" ));
+       objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
+       objInfo.setSubType(anno.getDictionary().getString("Subtype") +
+           "/" + anno.getDictionary().getString("Subj"));
+       objInfo.setPageNr(pageNr);
+       objInfo.setHeight(anno.getRectangle().getHeight());
+       objInfo.setWidth(anno.getRectangle().getWidth());
+       // Fix: the populated entry was previously discarded; hand it to the caller.
+       objectInfos.add(objInfo);
+
+       if (logger_.isDebugEnabled()) {
+         logger_.debug("Found non text object: " + objInfo.toString());
+       }
+     } catch (Exception ex) {
+       // Best effort: one unreadable annotation is logged and skipped so the
+       // remaining annotations of the page are still processed.
+       logger_.info("error reading non text object info key " + ex);
+     }
+   }
+ }
+
+ /**
+  * Reads the images from the resources returned by
+  * {@code page.getResources()} and appends one {@link NonTextObjectInfo} of
+  * type {@code TYPE_IMAGE} per image to the given list.
+  *
+  * If the images cannot be read, the error is logged and nothing is added
+  * for this page.
+  *
+  * @param objectInfos list that receives the created entries
+  * @param pageNr page number stored on every created entry
+  * @param page page whose resources are inspected
+  */
+ private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
+   PDResources resources = page.getResources();
+   if (resources == null) {
+     // NOTE(review): presumably a page without a resource dictionary yields
+     // null here - confirm against the pdfbox version in use. Without this
+     // guard such a page would fail with a NullPointerException.
+     return;
+   }
+
+   Map images;
+   try {
+     images = resources.getImages();
+   } catch (IOException e) {
+     logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
+     return;
+   }
+   if (images == null) {
+     return;
+   }
+
+   for (Iterator entryIter = images.entrySet().iterator(); entryIter.hasNext();) {
+     Map.Entry entry = (Map.Entry) entryIter.next();
+     String key = (String) entry.getKey();
+     PDXObjectImage image = (PDXObjectImage) entry.getValue();
+
+     NonTextObjectInfo objInfo = new NonTextObjectInfo();
+     objInfo.setHeight(image.getHeight());
+     objInfo.setWidth(image.getWidth());
+     objInfo.setName(key + image.getSuffix());
+     objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
+     objInfo.setPageNr(pageNr);
+     objectInfos.add(objInfo);
+
+     if (logger_.isDebugEnabled()) {
+       logger_.debug("Found non text object: " + objInfo.toString());
+     }
+   }
+ }
+
/**
* Normalizes a given binary PDF to a version PDFbox can handle correctly.
|