diff options
| author | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-12-02 16:49:34 +0000 | 
|---|---|---|
| committer | ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> | 2009-12-02 16:49:34 +0000 | 
| commit | baa99742a8faf1330fe073ab95f8dee7d4d30aec (patch) | |
| tree | 30996653d5ccbc511a685e16e42e9700dbec10cd /src/main/java | |
| parent | 0d9d384bfd15cce65d4077ae11951f3ce7f14dd6 (diff) | |
| download | pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.tar.gz pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.tar.bz2 pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.zip | |
moved nontextobject extraction to objectextractor
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@476 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java')
| -rw-r--r-- | src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java | 127 | 
1 files changed, 0 insertions, 127 deletions
| diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index d7fbe64..d5b3c5d 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -21,21 +21,12 @@ import java.io.ByteArrayInputStream;  import java.io.ByteArrayOutputStream;
  import java.io.File;
  import java.io.IOException;
 -import java.util.ArrayList;
 -import java.util.Iterator;
 -import java.util.List;
 -import java.util.Map;
  import org.apache.log4j.Logger;
  import org.pdfbox.pdfparser.PDFParser;
  import org.pdfbox.pdmodel.PDDocument;
 -import org.pdfbox.pdmodel.PDPage;
 -import org.pdfbox.pdmodel.PDResources;
 -import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
 -import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
  import org.pdfbox.util.PDFTextStripper;
 -import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
  import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
  import at.gv.egiz.pdfas.framework.input.PdfDataSource;
  import at.gv.egiz.pdfas.performance.PerformanceCounters;
 @@ -200,124 +191,6 @@ public class TextualSignature    }
    /**
 -   * Extract non textual data from pdf. 
 - * @param pdfDataSource 
 -   * 
 -   * @see org.pdfbox.ExtractImages
 -   * 
 -   * @return List of {@link NonTextObjectInfo}
 -   */
 -   public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
 -
 -      PDDocument doc = null;
 -      try {
 -         doc = PDDocument.load(pdfDataSource.createInputStream());
 -         List res = extractNonTextInfo(doc);
 -         doc.close();
 -         return res;
 -      } catch (IOException e) {
 -         logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
 -         return new ArrayList();
 -      } finally {
 -         if (doc != null) {
 -            try {
 -               doc.close();
 -            } catch (IOException e) {
 -               logger_.error("error closing pddocument", e);
 -            }            
 -         }
 -      }
 -      
 -   }
 -  
 -  /**
 -   * Extract non textual data from pdf. 
 -   * 
 -   * @see org.pdfbox.ExtractImages
 -   * 
 -   * @param document
 -   * @return List of {@link NonTextObjectInfo}
 -   */
 -  public static List extractNonTextInfo(PDDocument document) {         
 -     // extraction does not work with the normalized pdf from extractTextTextual
 -     logger_.debug("going to extract non text objects");
 -     List objectInfos = new ArrayList();
 -     List pages = document.getDocumentCatalog().getAllPages();
 -     Iterator iter = pages.iterator();
 -     int pageNr = 0;
 -     while (iter.hasNext()) {
 -        pageNr++;
 -        PDPage page = (PDPage) iter.next();
 -        doExtractFromResources(objectInfos, pageNr, page);
 -        //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
 -     }
 -     if (logger_.isDebugEnabled()) {
 -        logger_.debug("extracted non textual objects count: " + objectInfos.size());
 -     }
 -     logger_.debug("going to extract non text objects");
 -     return objectInfos;
 -
 -  }
 -
 -  private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
 -     List annotations; 
 -     try {
 -        annotations = page.getAnnotations();
 -     } catch (IOException e) {
 -        logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
 -        return;
 -     }
 -     for (Iterator it = annotations.iterator(); it.hasNext();) {
 -        try {
 -           PDAnnotation anno = (PDAnnotation) it.next();
 -           NonTextObjectInfo objInfo = new NonTextObjectInfo();
 -           objInfo.setName(anno.getDictionary().getString( "NM" ));
 -           objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
 -           objInfo.setSubType(anno.getDictionary().getString("Subtype") + 
 -                 "/" + anno.getDictionary().getString("Subj"));
 -           objInfo.setPageNr(pageNr);
 -           objInfo.setHeight(anno.getRectangle().getHeight());
 -           objInfo.setWidth(anno.getRectangle().getWidth());
 -        } catch (Exception ex) {
 -           logger_.info("error reading non text object info key " + ex);
 -        }
 -     }   
 -  }
 -
 -  private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
 -     PDResources resources = page.getResources();
 - 
 -     Map images;
 -     try {
 -        images = resources.getImages();
 -     } catch (IOException e) {
 -        logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
 -        return;
 -     }
 -     if (images != null) {
 -        Iterator imageIter = images.keySet().iterator();
 -        while (imageIter.hasNext()) {
 -           NonTextObjectInfo objInfo = new NonTextObjectInfo();
 -           String key = (String) imageIter.next();
 -           PDXObjectImage image = (PDXObjectImage) images.get(key);
 -           System.err.println(image);
 -           
 -           objInfo.setHeight(image.getHeight());
 -           objInfo.setWidth(image.getWidth());
 -           objInfo.setName(key + image.getSuffix());
 -           objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
 -           objInfo.setPageNr(pageNr);
 -           objectInfos.add(objInfo);
 -           
 -           if (logger_.isDebugEnabled()) {
 -              logger_.debug("Found non text object: " + objInfo.toString());
 -           }
 -        }
 -     }     
 -  }
 -
 -
 -  /**
     * Normalizes a given binary PDF to a version PDFbox can handle correctly.
     * 
     * <p>
 | 
