diff options
| author | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-12-02 16:47:37 +0000 | 
|---|---|---|
| committer | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-12-02 16:47:37 +0000 | 
| commit | 0d9d384bfd15cce65d4077ae11951f3ce7f14dd6 (patch) | |
| tree | 4bd136a8b64ed6da31c0f69ee3eefb25a7139e44 | |
| parent | fa05e97f3e4343694041be24e8ff779a76cd36f6 (diff) | |
| download | pdf-as-3-0d9d384bfd15cce65d4077ae11951f3ce7f14dd6.tar.gz pdf-as-3-0d9d384bfd15cce65d4077ae11951f3ce7f14dd6.tar.bz2 pdf-as-3-0d9d384bfd15cce65d4077ae11951f3ce7f14dd6.zip | |
added annotation extraction with pdfbox 0.8.0
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@475 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
| -rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java | 171 | 
1 files changed, 171 insertions, 0 deletions
| diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java new file mode 100644 index 0000000..f19a2f4 --- /dev/null +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java @@ -0,0 +1,171 @@ +package at.knowcenter.wag.egov.egiz.pdf;
 +
 +import java.io.IOException;
 +import java.util.ArrayList;
 +import java.util.Iterator;
 +import java.util.List;
 +import java.util.Map;
 +
 +import org.apache.log4j.Logger;
 +import org.apache.pdfbox.pdmodel.PDDocument;
 +import org.apache.pdfbox.pdmodel.PDPage;
 +import org.apache.pdfbox.pdmodel.PDResources;
 +import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
 +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 +
 +import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
 +import at.gv.egiz.pdfas.framework.input.PdfDataSource;
 +
 +/**
 + * Method for object extraction from pdf documents.
 + * This uses pdf-box 0.8.0, not 0.7.2. Packages org.apache.pdfbox instead org.pdfbox!
 + * @author dferbas
 + *
 + */
 +public class ObjectExtractor {
 +   private static Logger log = Logger.getLogger(ObjectExtractor.class);
 +
 +   /**
 +    * Find annotation objects in pdf documents
 +    * @param objectInfos
 +    * @param pageNr
 +    * @param page
 +    */
 +   private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
 +        List annotations; 
 +        try {
 +           annotations = page.getAnnotations();
 +        } catch (IOException e) {
 +           log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
 +           return;
 +        }
 +        for (Iterator it = annotations.iterator(); it.hasNext();) {
 +           try {              
 +              PDAnnotation anno = (PDAnnotation) it.next();
 +              log.debug("found annotation: " +anno);
 +              String ft = anno.getDictionary().getNameAsString("FT");              
 +              if (ft != null && ft.equals("Sig")) {  // skip signature widgets
 +                 log.debug("found signature widged, skip further extraction");
 +                 break;
 +              }             
 +              NonTextObjectInfo objInfo = new NonTextObjectInfo();
 +              objInfo.setName(anno.getDictionary().getString( "NM" ));
 +              objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
 +
 +              String subtype = anno.getDictionary().getNameAsString("Subtype");
 +              String subj = anno.getDictionary().getString("Subj");
 +              if (subj != null) {
 +                subtype+= "/" + subj; 
 +              }
 +              objInfo.setSubType(subtype);
 +              
 +              objInfo.setPageNr(pageNr);
 +              objInfo.setHeight(anno.getRectangle().getHeight());
 +              objInfo.setWidth(anno.getRectangle().getWidth());
 +              objectInfos.add(objInfo);              
 +           } catch (Exception ex) {
 +              log.info("error reading non text object info key " + ex);
 +           }
 +        }   
 +     }
 +
 +   /**
 +    * Find resources (images) in pdf documents
 +    * @param objectInfos
 +    * @param pageNr
 +    * @param page
 +    */
 +   private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
 +        PDResources resources = page.getResources();
 +    
 +        Map images;
 +        try {
 +           images = resources.getImages();
 +        } catch (IOException e) {
 +           log.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
 +           return;
 +        }
 +        if (images != null) {
 +           Iterator imageIter = images.keySet().iterator();
 +           while (imageIter.hasNext()) {
 +              NonTextObjectInfo objInfo = new NonTextObjectInfo();
 +              String key = (String) imageIter.next();
 +              PDXObjectImage image = (PDXObjectImage) images.get(key);
 +              System.err.println(image);
 +              
 +              objInfo.setHeight(image.getHeight());
 +              objInfo.setWidth(image.getWidth());
 +              objInfo.setName(key + image.getSuffix());
 +              objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
 +              objInfo.setPageNr(pageNr);
 +              objectInfos.add(objInfo);
 +              objInfo.setSubType(image.getSuffix());
 +              
 +              if (log.isDebugEnabled()) {
 +                 log.debug("Found non text object: " + objInfo.toString());
 +              }
 +           }
 +        }     
 +     }
 +
 +   /**
 +      * Extract non textual data from pdf. 
 +      * 
 +      * @see org.pdfbox.ExtractImages
 +      * 
 +      * @param document
 +      * @return List of {@link NonTextObjectInfo}
 +      */
 +     public static List extractNonTextInfo(PDDocument document) {         
 +        // extraction does not work with the normalized pdf from extractTextTextual
 +        log.debug("going to extract non text objects");
 +        List objectInfos = new ArrayList();
 +        List pages = document.getDocumentCatalog().getAllPages();
 +        Iterator iter = pages.iterator();
 +        int pageNr = 0;
 +        while (iter.hasNext()) {
 +           pageNr++;
 +           PDPage page = (PDPage) iter.next();
 +           doExtractFromResources(objectInfos, pageNr, page);
 +           doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
 +        }
 +        if (log.isDebugEnabled()) {
 +           log.debug("extracted non textual objects count: " + objectInfos.size());
 +        }
 +        log.debug("going to extract non text objects");
 +        return objectInfos;
 +   
 +     }
 +
 +   /**
 +      * Extract non textual data from pdf. 
 +    * @param pdfDataSource 
 +      * 
 +      * @see org.pdfbox.ExtractImages
 +      * 
 +      * @return List of {@link NonTextObjectInfo}
 +      */
 +      public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
 +   
 +         PDDocument doc = null;
 +         try {
 +            doc = PDDocument.load(pdfDataSource.createInputStream());
 +            List res = extractNonTextInfo(doc);
 +            doc.close();
 +            return res;
 +         } catch (IOException e) {
 +            log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
 +            return new ArrayList();
 +         } finally {
 +            if (doc != null) {
 +               try {
 +                  doc.close();
 +               } catch (IOException e) {
 +                  log.error("error closing pddocument", e);
 +               }            
 +            }
 +         }
 +         
 +      }
 +
 +}
 | 
