/*
 * Provenance (from the original patch header):
 *   commit 0d9d384bfd15cce65d4077ae11951f3ce7f14dd6
 *   From: ferbas, Wed, 2 Dec 2009 16:47:37 +0000
 *   Subject: added annotation extraction with pdfbox 0.8.0
 *   git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@475 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
 *   File: src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java
 */
package at.knowcenter.wag.egov.egiz.pdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;

import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
import at.gv.egiz.pdfas.framework.input.PdfDataSource;

/**
 * Methods for extracting non-text objects (images and annotations) from pdf documents.
 * This uses pdf-box 0.8.0, not 0.7.2. Packages are org.apache.pdfbox instead of org.pdfbox!
 *
 * @author dferbas
 */
public class ObjectExtractor {
    private static final Logger log = Logger.getLogger(ObjectExtractor.class);

    /**
     * Finds annotation objects on a single page and appends one
     * {@link NonTextObjectInfo} of type {@link NonTextObjectInfo#TYPE_ANNOTATION}
     * per annotation to {@code objectInfos}.
     *
     * <p>If the annotation list cannot be read, the error is logged and nothing is added.
     * A failure while reading a single annotation is logged and that annotation is skipped.
     *
     * @param objectInfos list that receives the extracted {@link NonTextObjectInfo} entries
     * @param pageNr      1-based page number stored in each entry
     * @param page        the page to inspect
     */
    private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
        List annotations;
        try {
            annotations = page.getAnnotations();
        } catch (IOException e) {
            log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
            return;
        }
        for (Iterator it = annotations.iterator(); it.hasNext();) {
            try {
                PDAnnotation anno = (PDAnnotation) it.next();
                log.debug("found annotation: " + anno);
                String ft = anno.getDictionary().getNameAsString("FT");
                if (ft != null && ft.equals("Sig")) { // signature widget
                    // NOTE(review): 'break' stops processing ALL remaining annotations of this
                    // page, not just this widget. The log text suggests this is intended;
                    // confirm, otherwise this should be 'continue'.
                    log.debug("found signature widged, skip further extraction");
                    break;
                }
                NonTextObjectInfo objInfo = new NonTextObjectInfo();
                objInfo.setName(anno.getDictionary().getString("NM"));
                objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);

                // Sub type is "<Subtype>" or "<Subtype>/<Subj>" when a subject is present.
                String subtype = anno.getDictionary().getNameAsString("Subtype");
                String subj = anno.getDictionary().getString("Subj");
                if (subj != null) {
                    subtype += "/" + subj;
                }
                objInfo.setSubType(subtype);

                objInfo.setPageNr(pageNr);
                objInfo.setHeight(anno.getRectangle().getHeight());
                objInfo.setWidth(anno.getRectangle().getWidth());
                objectInfos.add(objInfo);
            } catch (Exception ex) {
                // Best effort: one unreadable annotation must not abort the whole extraction.
                // Pass the exception as throwable so the stack trace is not lost.
                log.info("error reading non text object info key " + ex, ex);
            }
        }
    }

    /**
     * Finds image resources on a single page and appends one
     * {@link NonTextObjectInfo} of type {@link NonTextObjectInfo#TYPE_IMAGE}
     * per image to {@code objectInfos}.
     *
     * <p>If the page has no resources, or the images cannot be read (logged as error),
     * nothing is added.
     *
     * @param objectInfos list that receives the extracted {@link NonTextObjectInfo} entries
     * @param pageNr      1-based page number stored in each entry
     * @param page        the page to inspect
     */
    private static void doExtractFromResources(List objectInfos, int pageNr, PDPage page) {
        PDResources resources = page.getResources();
        if (resources == null) {
            // Page carries no resource dictionary of its own; nothing to extract here.
            return;
        }

        Map images;
        try {
            images = resources.getImages();
        } catch (IOException e) {
            log.error("Error extracting images from pdf. No NonTextObjectInfo-Images available.", e);
            return;
        }
        if (images == null) {
            return;
        }

        for (Iterator imageIter = images.entrySet().iterator(); imageIter.hasNext();) {
            Map.Entry entry = (Map.Entry) imageIter.next();
            String key = (String) entry.getKey();
            PDXObjectImage image = (PDXObjectImage) entry.getValue();

            NonTextObjectInfo objInfo = new NonTextObjectInfo();
            objInfo.setHeight(image.getHeight());
            objInfo.setWidth(image.getWidth());
            objInfo.setName(key + image.getSuffix());
            objInfo.setObjectType(NonTextObjectInfo.TYPE_IMAGE);
            objInfo.setSubType(image.getSuffix());
            objInfo.setPageNr(pageNr);
            objectInfos.add(objInfo);

            if (log.isDebugEnabled()) {
                log.debug("found image: " + image);
                log.debug("Found non text object: " + objInfo.toString());
            }
        }
    }

    /**
     * Extracts non textual data (images and annotations) from all pages of a pdf document.
     * The document is not closed by this method.
     *
     * @see org.apache.pdfbox.ExtractImages
     *
     * @param document the opened pdf document
     * @return List of {@link NonTextObjectInfo}
     */
    public static List extractNonTextInfo(PDDocument document) {
        // extraction does not work with the normalized pdf from extractTextTextual
        log.debug("going to extract non text objects");
        List objectInfos = new ArrayList();
        List pages = document.getDocumentCatalog().getAllPages();
        int pageNr = 0;
        for (Iterator iter = pages.iterator(); iter.hasNext();) {
            pageNr++; // page numbers are reported 1-based
            PDPage page = (PDPage) iter.next();
            doExtractFromResources(objectInfos, pageNr, page);
            doExtractAnnotations(objectInfos, pageNr, page); // does not work with pdf-box 0.7.2 -> 0.8.0 needed
        }
        if (log.isDebugEnabled()) {
            log.debug("extracted non textual objects count: " + objectInfos.size());
        }
        log.debug("finished extracting non text objects");
        return objectInfos;
    }

    /**
     * Extracts non textual data (images and annotations) from a pdf data source.
     * The document loaded from the data source is always closed before returning.
     *
     * @param pdfDataSource source providing the pdf input stream
     *
     * @see org.apache.pdfbox.ExtractImages
     *
     * @return List of {@link NonTextObjectInfo}; an empty list if the pdf could not be loaded
     */
    public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
        PDDocument doc = null;
        try {
            doc = PDDocument.load(pdfDataSource.createInputStream());
            return extractNonTextInfo(doc);
        } catch (IOException e) {
            log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
            return new ArrayList();
        } finally {
            // Single point of closing: a failing close must not discard extracted results.
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    log.error("error closing pddocument", e);
                }
            }
        }
    }

}