aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov
diff options
context:
space:
mode:
author ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2009-12-02 16:49:34 +0000
committer ferbas <ferbas@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c> 2009-12-02 16:49:34 +0000
commit baa99742a8faf1330fe073ab95f8dee7d4d30aec (patch)
tree 30996653d5ccbc511a685e16e42e9700dbec10cd /src/main/java/at/knowcenter/wag/egov
parent 0d9d384bfd15cce65d4077ae11951f3ce7f14dd6 (diff)
download pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.tar.gz
pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.tar.bz2
pdf-as-3-baa99742a8faf1330fe073ab95f8dee7d4d30aec.zip
moved nontextobject extraction to objectextractor
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@476 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov')
-rw-r--r-- src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java 127
1 files changed, 0 insertions, 127 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
index d7fbe64..d5b3c5d 100644
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -21,21 +21,12 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
import org.apache.log4j.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
-import org.pdfbox.pdmodel.PDPage;
-import org.pdfbox.pdmodel.PDResources;
-import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
-import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.pdfbox.util.PDFTextStripper;
-import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException;
import at.gv.egiz.pdfas.framework.input.PdfDataSource;
import at.gv.egiz.pdfas.performance.PerformanceCounters;
@@ -200,124 +191,6 @@ public class TextualSignature
}
/**
- * Extract non textual data from pdf.
- * @param pdfDataSource
- *
- * @see org.pdfbox.ExtractImages
- *
- * @return List of {@link NonTextObjectInfo}
- */
- public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
-
- PDDocument doc = null;
- try {
- doc = PDDocument.load(pdfDataSource.createInputStream());
- List res = extractNonTextInfo(doc);
- doc.close();
- return res;
- } catch (IOException e) {
- logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
- return new ArrayList();
- } finally {
- if (doc != null) {
- try {
- doc.close();
- } catch (IOException e) {
- logger_.error("error closing pddocument", e);
- }
- }
- }
-
- }
-
- /**
- * Extract non textual data from pdf.
- *
- * @see org.pdfbox.ExtractImages
- *
- * @param document
- * @return List of {@link NonTextObjectInfo}
- */
- public static List extractNonTextInfo(PDDocument document) {
- // extraction does not work with the normalized pdf from extractTextTextual
- logger_.debug("going to extract non text objects");
- List objectInfos = new ArrayList();
- List pages = document.getDocumentCatalog().getAllPages();
- Iterator iter = pages.iterator();
- int pageNr = 0;
- while (iter.hasNext()) {
- pageNr++;
- PDPage page = (PDPage) iter.next();
- doExtractFromResources(objectInfos, pageNr, page);
- //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
- }
- if (logger_.isDebugEnabled()) {
- logger_.debug("extracted non textual objects count: " + objectInfos.size());
- }
- logger_.debug("going to extract non text objects");
- return objectInfos;
-
- }
-
- private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
- List annotations;
- try {
- annotations = page.getAnnotations();
- } catch (IOException e) {
- logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
- return;
- }
- for (Iterator it = annotations.iterator(); it.hasNext();) {
- try {
- PDAnnotation anno = (PDAnnotation) it.next();
- NonTextObjectInfo objInfo = new NonTextObjectInfo();
- objInfo.setName(anno.getDictionary().getString( "NM" ));
- objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);
- objInfo.setSubType(anno.getDictionary().getString("Subtype") +
- "/" + anno.getDictionary().getString("Subj"));
- objInfo.setPageNr(pageNr);
- objInfo.setHeight(anno.getRectangle().getHeight());
- objInfo.setWidth(anno.getRectangle().getWidth());
- } catch (Exception ex) {
- logger_.info("error reading non text object info key " + ex);
- }
- }
- }
-
- private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
- PDResources resources = page.getResources();
-
- Map images;
- try {
- images = resources.getImages();
- } catch (IOException e) {
- logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
- return;
- }
- if (images != null) {
- Iterator imageIter = images.keySet().iterator();
- while (imageIter.hasNext()) {
- NonTextObjectInfo objInfo = new NonTextObjectInfo();
- String key = (String) imageIter.next();
- PDXObjectImage image = (PDXObjectImage) images.get(key);
- System.err.println(image);
-
- objInfo.setHeight(image.getHeight());
- objInfo.setWidth(image.getWidth());
- objInfo.setName(key + image.getSuffix());
- objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
- objInfo.setPageNr(pageNr);
- objectInfos.add(objInfo);
-
- if (logger_.isDebugEnabled()) {
- logger_.debug("Found non text object: " + objInfo.toString());
- }
- }
- }
- }
-
-
- /**
* Normalizes a given binary PDF to a version PDFbox can handle correctly.
*
* <p>