pdf-as-lib maven project files moved to pdf-as-lib

git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
author: tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2013-01-09 15:41:29 +0000
committer: tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2013-01-09 15:41:29 +0000
commit: 535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree: 0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java
parent: 1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
download: pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip
1 files changed, 0 insertions, 233 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java
deleted file mode 100644
index eb7377a..0000000
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java
+++ /dev/null
@@ -1,233 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- */
-package at.knowcenter.wag.egov.egiz.pdf;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.log4j.Logger;
-import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-
-import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
-import at.gv.egiz.pdfas.framework.input.PdfDataSource;
-
-/**
- * Method for object extraction from pdf documents.
- * This uses pdf-box 0.8.0, not 0.7.2. Packages org.apache.pdfbox instead org.pdfbox!
- * @author dferbas
- *
- */
-public class ObjectExtractor {
-   private static Logger log = Logger.getLogger(ObjectExtractor.class);
-   
-   /*
-    * If set <code>true</code> signature annotations are not extracted otherwise
-    * all signatures except PDF-AS signatures are extracted.
-    */
-   private final static boolean SKIP_NON_PDFAS_SIGNATURES = false;
-
-   /**
-    * Find annotation objects in pdf documents
-    * @param objectInfos
-    * @param pageNr
-    * @param page
-    */
-   private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
-        List annotations; 
-        try {
-           annotations = page.getAnnotations();
-        } catch (IOException e) {
-           log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
-           return;
-        }
-        for (Iterator it = annotations.iterator(); it.hasNext();) {
-           try {              
-              PDAnnotation anno = (PDAnnotation) it.next();
-              log.debug("found annotation: " +anno);
-              if (log.isTraceEnabled()) {
-                 log.trace("annotation def: " + dictToString(anno.getDictionary()));
-              }
-              String ft = anno.getDictionary().getNameAsString("FT");
-              if (ft != null && ft.equals("Sig")) {  // skip signature widgets
-                 if (SKIP_NON_PDFAS_SIGNATURES) {
-                    log.debug("found signature widged, skip extraction");
-                    continue;
-                 } else {
-                    COSDictionary sigDict = (COSDictionary) anno.getDictionary().getDictionaryObject("V");
-                    if (sigDict != null && AdobeSignatureHelper.ADOBE_SIG_FILTER.equals(sigDict.getNameAsString("Filter"))) {
-                       log.debug("found PDF-AS signature widged, skip extraction");
-                       continue;
-                    }
-                 }
-              }
-
-              NonTextObjectInfo objInfo = new NonTextObjectInfo();
-              objInfo.setName(anno.getDictionary().getString( "NM" ));
-              objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
-
-              String subtype = anno.getDictionary().getNameAsString("Subtype");
-              String subj = anno.getDictionary().getString("Subj");
-              if (subj != null) {
-                subtype+= "/" + subj; 
-              }
-              objInfo.setSubType(subtype);
-              
-              objInfo.setPageNr(pageNr);
-              objInfo.setHeight(anno.getRectangle().getHeight());
-              objInfo.setWidth(anno.getRectangle().getWidth());
-              objectInfos.add(objInfo);              
-           } catch (Exception ex) {
-              log.info("error reading non text object info key " + ex);
-           }
-        }   
-     }
-   
-   /**
-    * Create string representation from COSDictionary
-    * @param dict
-    * @return
-    */
-   public static String dictToString(COSDictionary dict)
-   {
-       try {
-         String retVal = "COSDictionary{";
-          for (int i = 0; i<dict.size(); i++)
-          {
-              COSName key = (COSName)dict.keyList().get(i);           
-              retVal = retVal + "(" + key + ":" + dict.getItem(key) + ") ";
-          }
-          retVal = retVal + "}";
-          return retVal;
-      } catch (Exception e) {
-         return "no detail available";
-      }
-   }
-
-   /**
-    * Find resources (images) in pdf documents
-    * @param objectInfos
-    * @param pageNr
-    * @param page
-    */
-   private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
-        PDResources resources = page.getResources();
-    
-        Map images;
-        try {
-           images = resources.getImages();
-        } catch (IOException e) {
-           log.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
-           return;
-        }
-        if (images != null) {
-           Iterator imageIter = images.keySet().iterator();
-           while (imageIter.hasNext()) {
-              NonTextObjectInfo objInfo = new NonTextObjectInfo();
-              String key = (String) imageIter.next();
-              PDXObjectImage image = (PDXObjectImage) images.get(key);
-              
-              objInfo.setHeight(image.getHeight());
-              objInfo.setWidth(image.getWidth());
-              objInfo.setName(key + image.getSuffix());
-              objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
-              objInfo.setPageNr(pageNr);
-              objectInfos.add(objInfo);
-              objInfo.setSubType(image.getSuffix());
-              
-              if (log.isDebugEnabled()) {
-                 log.debug("Found non text object: " + objInfo.toString());
-              }
-           }
-        }     
-     }
-
-   /**
-      * Extract non textual data from pdf. 
-      * 
-      * @see org.pdfbox.ExtractImages
-      * 
-      * @param document
-      * @return List of {@link NonTextObjectInfo}
-      */
-     public static List extractNonTextInfo(PDDocument document) {         
-        // extraction does not work with the normalized pdf from extractTextTextual
-        log.debug("going to extract non text objects");
-        List objectInfos = new ArrayList();
-        List pages = document.getDocumentCatalog().getAllPages();
-        Iterator iter = pages.iterator();
-        int pageNr = 0;
-        while (iter.hasNext()) {
-           pageNr++;
-           PDPage page = (PDPage) iter.next();
-           doExtractFromResources(objectInfos, pageNr, page);
-           doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
-        }
-        if (log.isDebugEnabled()) {
-           log.debug("extracted non textual objects count: " + objectInfos.size());
-        }
-        return objectInfos;
-   
-     }
-
-   /**
-      * Extract non textual data from pdf. 
-    * @param pdfDataSource 
-      * 
-      * @see org.pdfbox.ExtractImages
-      * 
-      * @return List of {@link NonTextObjectInfo}
-      */
-      public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
-   
-         PDDocument doc = null;
-         try {
-            doc = PDDocument.load(pdfDataSource.createInputStream());
-            List res = extractNonTextInfo(doc);
-            doc.close();
-            return res;
-         } catch (IOException e) {
-            log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
-            return new ArrayList();
-         } finally {
-            if (doc != null) {
-               try {
-                  doc.close();
-               } catch (IOException e) {
-                  log.error("error closing pddocument", e);
-               }            
-            }
-         }
-         
-      }
-
-}
author	tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>	2013-01-09 15:41:29 +0000
committer	tknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>	2013-01-09 15:41:29 +0000
commit	535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree	0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java
parent	1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
download	pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2 pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip