From baa99742a8faf1330fe073ab95f8dee7d4d30aec Mon Sep 17 00:00:00 2001 From: ferbas Date: Wed, 2 Dec 2009 16:49:34 +0000 Subject: moved nontextobject extraction to objectextractor git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@476 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 127 --------------------- 1 file changed, 127 deletions(-) (limited to 'src/main/java/at/knowcenter/wag/egov') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index d7fbe64..d5b3c5d 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -21,21 +21,12 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; import org.apache.log4j.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; -import org.pdfbox.pdmodel.PDPage; -import org.pdfbox.pdmodel.PDResources; -import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; -import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.pdfbox.util.PDFTextStripper; -import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo; import at.gv.egiz.pdfas.exceptions.pdf.TextExtractionException; import at.gv.egiz.pdfas.framework.input.PdfDataSource; import at.gv.egiz.pdfas.performance.PerformanceCounters; @@ -199,124 +190,6 @@ public class TextualSignature } } - /** - * Extract non textual data from pdf. - * @param pdfDataSource - * - * @see org.pdfbox.ExtractImages - * - * @return List of {@link NonTextObjectInfo} - */ - public static List extractNonTextInfo(PdfDataSource pdfDataSource) { - - PDDocument doc = null; - try { - doc = PDDocument.load(pdfDataSource.createInputStream()); - List res = extractNonTextInfo(doc); - doc.close(); - return res; - } catch (IOException e) { - logger_.error("Error extracting images from pdf. No NonTextObjectInfo available.", e); - return new ArrayList(); - } finally { - if (doc != null) { - try { - doc.close(); - } catch (IOException e) { - logger_.error("error closing pddocument", e); - } - } - } - - } - - /** - * Extract non textual data from pdf. - * - * @see org.pdfbox.ExtractImages - * - * @param document - * @return List of {@link NonTextObjectInfo} - */ - public static List extractNonTextInfo(PDDocument document) { - // extraction does not work with the normalized pdf from extractTextTextual - logger_.debug("going to extract non text objects"); - List objectInfos = new ArrayList(); - List pages = document.getDocumentCatalog().getAllPages(); - Iterator iter = pages.iterator(); - int pageNr = 0; - while (iter.hasNext()) { - pageNr++; - PDPage page = (PDPage) iter.next(); - doExtractFromResources(objectInfos, pageNr, page); - //doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed - } - if (logger_.isDebugEnabled()) { - logger_.debug("extracted non textual objects count: " + objectInfos.size()); - } - logger_.debug("going to extract non text objects"); - return objectInfos; - - } - - private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) { - List annotations; - try { - annotations = page.getAnnotations(); - } catch (IOException e) { - logger_.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e); - return; - } - for (Iterator it = annotations.iterator(); it.hasNext();) { - try { - PDAnnotation anno = (PDAnnotation) it.next(); - NonTextObjectInfo objInfo = new NonTextObjectInfo(); - objInfo.setName(anno.getDictionary().getString( "NM" )); - objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION); - objInfo.setSubType(anno.getDictionary().getString("Subtype") + - "/" + anno.getDictionary().getString("Subj")); - objInfo.setPageNr(pageNr); - objInfo.setHeight(anno.getRectangle().getHeight()); - objInfo.setWidth(anno.getRectangle().getWidth()); - } catch (Exception ex) { - logger_.info("error reading non text object info key " + ex); - } - } - } - - private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) { - PDResources resources = page.getResources(); - - Map images; - try { - images = resources.getImages(); - } catch (IOException e) { - logger_.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e); - return; - } - if (images != null) { - Iterator imageIter = images.keySet().iterator(); - while (imageIter.hasNext()) { - NonTextObjectInfo objInfo = new NonTextObjectInfo(); - String key = (String) imageIter.next(); - PDXObjectImage image = (PDXObjectImage) images.get(key); - System.err.println(image); - - objInfo.setHeight(image.getHeight()); - objInfo.setWidth(image.getWidth()); - objInfo.setName(key + image.getSuffix()); - objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE); - objInfo.setPageNr(pageNr); - objectInfos.add(objInfo); - - if (logger_.isDebugEnabled()) { - logger_.debug("Found non text object: " + objInfo.toString()); - } - } - } - } - - /** * Normalizes a given binary PDF to a version PDFbox can handle correctly. * -- cgit v1.2.3