diff options
| author | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-11-26 09:53:03 +0000 | 
|---|---|---|
| committer | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-11-26 09:53:03 +0000 | 
| commit | 0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1 (patch) | |
| tree | 187fc0d3bbb6e140920c09881ac8331738cc25c0 /src/main/java | |
| parent | f6882cf38725a930b3e60db5289da9f35f9e7477 (diff) | |
| download | pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.tar.gz pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.tar.bz2 pdf-as-3-0f3ea4c6a974c3ca8907d43d3353ef39ca21ecb1.zip | |
improved exception handling
added annotation extraction (deactivated, needs pdfbox 0.8.0)
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@464 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java')
| -rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java | 97 | 
1 files changed, 69 insertions, 28 deletions
| diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 7f567c4..d7fbe64 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -32,6 +32,7 @@ import org.pdfbox.pdmodel.PDDocument;  import org.pdfbox.pdmodel.PDPage;
  import org.pdfbox.pdmodel.PDResources;
  import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
 +import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
  import org.pdfbox.util.PDFTextStripper;
  import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
 @@ -91,6 +92,11 @@ public class TextualSignature          byte [] pdf_data = pdfDataSource.getAsByteArray();
          PdfReader reader = new PdfReader(pdf_data);
 +        if (!reader.isOpenedWithFullPermissions()) {
 +           // cannot perform modification and extraction
 +           throw new TextExtractionException("Document is protected");
 +           
 +        }
          //pdf_stream.close();
          // PERF: PDF normalization needs byte array - this is costy
 @@ -179,6 +185,10 @@ public class TextualSignature        return text;
      }
 +    catch (IllegalArgumentException e)
 +    {
 +      throw new TextExtractionException(e);
 +    }
      catch (IOException e)
      {
        throw new TextExtractionException(e);
 @@ -238,34 +248,8 @@ public class TextualSignature       while (iter.hasNext()) {
          pageNr++;
          PDPage page = (PDPage) iter.next();
 -        PDResources resources = page.getResources();
 -
 -        Map images;
 -         try {
 -            images = resources.getImages();
 -         } catch (IOException e) {
 -            logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
 -            return objectInfos;
 -         }
 -        if (images != null) {
 -           Iterator imageIter = images.keySet().iterator();
 -           while (imageIter.hasNext()) {
 -              NonTextObjectInfo objInfo = new NonTextObjectInfo();
 -              String key = (String) imageIter.next();
 -              PDXObjectImage image = (PDXObjectImage) images.get(key);
 -              
 -              objInfo.setHeight(image.getHeight());
 -              objInfo.setWidth(image.getWidth());
 -              objInfo.setName(key + image.getSuffix());
 -              objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
 -              objInfo.setPageNr(pageNr);
 -              objectInfos.add(objInfo);
 -              
 -              if (logger_.isDebugEnabled()) {
 -                 logger_.debug("Found non text object: " + objInfo.toString());
 -              }
 -           }
 -        }
 +        doExtractFromResources(objectInfos, pageNr, page);
 +        //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
       }
       if (logger_.isDebugEnabled()) {
          logger_.debug("extracted non textual objects count: " + objectInfos.size());
 @@ -275,6 +259,63 @@ public class TextualSignature    }
 +  /**
 +   * Collects information about the annotations of a single page and appends one
 +   * {@link NonTextObjectInfo} per successfully read annotation to the given list.
 +   *
 +   * <p>The name is taken from the annotation dictionary entry "NM", the sub type is
 +   * built from the entries "Subtype" and "Subj" (joined by "/"), and the dimensions
 +   * come from the annotation rectangle. An annotation that cannot be read is logged
 +   * and skipped so that the remaining annotations are still reported.</p>
 +   *
 +   * <p>NOTE: the caller currently keeps this extraction deactivated because it does
 +   * not work with pdf-box 0.7.2 (0.8.0 is needed).</p>
 +   *
 +   * @param objectInfos list that receives the created NonTextObjectInfo entries
 +   * @param pageNr number of the page, stored in every created entry
 +   * @param page the page whose annotations are inspected
 +   */
 +  private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
 +     List annotations;
 +     try {
 +        annotations = page.getAnnotations();
 +     } catch (IOException e) {
 +        logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
 +        return;
 +     }
 +     if (annotations == null) {
 +        // Defensive: nothing to report for this page.
 +        return;
 +     }
 +     for (Iterator it = annotations.iterator(); it.hasNext();) {
 +        try {
 +           PDAnnotation anno = (PDAnnotation) it.next();
 +           NonTextObjectInfo objInfo = new NonTextObjectInfo();
 +           objInfo.setName(anno.getDictionary().getString( "NM" ));
 +           objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
 +           objInfo.setSubType(anno.getDictionary().getString("Subtype") +
 +                 "/" + anno.getDictionary().getString("Subj"));
 +           objInfo.setPageNr(pageNr);
 +           objInfo.setHeight(anno.getRectangle().getHeight());
 +           objInfo.setWidth(anno.getRectangle().getWidth());
 +
 +           // Register the entry only after every attribute was read successfully;
 +           // without this the extracted annotation info would be silently dropped.
 +           objectInfos.add(objInfo);
 +
 +           if (logger_.isDebugEnabled()) {
 +              logger_.debug("Found non text object: " + objInfo.toString());
 +           }
 +        } catch (Exception ex) {
 +           // Best effort: a single unreadable annotation must not abort the extraction.
 +           // The exception is passed along so that the stack trace is not lost.
 +           logger_.info("error reading non text object info key " + ex, ex);
 +        }
 +     }
 +  }
 +
 +  /**
 +   * Collects information about the images found in the resources of a single page
 +   * and appends one {@link NonTextObjectInfo} of type
 +   * {@link NonTextObjectInfo#TYPE_IMAGE} per image to the given list.
 +   *
 +   * <p>Each entry carries the image height and width, a name built from the resource
 +   * key followed by the image suffix, and the page number. If the images cannot be
 +   * read, the error is logged and nothing is added for this page.</p>
 +   *
 +   * @param objectInfos list that receives the created NonTextObjectInfo entries
 +   * @param pageNr number of the page, stored in every created entry
 +   * @param page the page whose resources are inspected
 +   */
 +  private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
 +     PDResources resources = page.getResources();
 +     if (resources == null) {
 +        // A page without a resources object of its own has no images to report here.
 +        return;
 +     }
 +
 +     Map images;
 +     try {
 +        images = resources.getImages();
 +     } catch (IOException e) {
 +        logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
 +        return;
 +     }
 +     if (images == null) {
 +        return;
 +     }
 +
 +     // Walk the entries directly instead of looking every key up again.
 +     for (Iterator imageIter = images.entrySet().iterator(); imageIter.hasNext();) {
 +        Map.Entry entry = (Map.Entry) imageIter.next();
 +        String key = (String) entry.getKey();
 +        PDXObjectImage image = (PDXObjectImage) entry.getValue();
 +
 +        NonTextObjectInfo objInfo = new NonTextObjectInfo();
 +        objInfo.setHeight(image.getHeight());
 +        objInfo.setWidth(image.getWidth());
 +        objInfo.setName(key + image.getSuffix());
 +        objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
 +        objInfo.setPageNr(pageNr);
 +        objectInfos.add(objInfo);
 +
 +        if (logger_.isDebugEnabled()) {
 +           logger_.debug("Found non text object: " + objInfo.toString());
 +        }
 +     }
 +  }
 +
    /**
     * Normalizes a given binary PDF to a version PDFbox can handle correctly.
 | 
