From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/ObjectExtractor.java | 233 +++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java new file mode 100644 index 0000000..eb7377a --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/ObjectExtractor.java @@ -0,0 +1,233 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. 
Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.knowcenter.wag.egov.egiz.pdf; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.log4j.Logger; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; + +import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; + +/** + * Utility methods for extracting non-text object information from pdf documents. + * This uses pdf-box 0.8.0, not 0.7.2, i.e. the packages are org.apache.pdfbox instead of org.pdfbox! + * @author dferbas + * + */ +public class ObjectExtractor { + private static Logger log = Logger.getLogger(ObjectExtractor.class); + + /* + * If set to true, signature annotations are not extracted at all; otherwise + * all signature annotations except PDF-AS signatures are extracted. + */ + private final static boolean SKIP_NON_PDFAS_SIGNATURES = false; + + /** + * Finds the annotation objects of a single pdf page and adds one NonTextObjectInfo + * per extracted annotation to the given list. Failures are logged, not thrown. + * @param objectInfos list the created NonTextObjectInfo entries are added to + * @param pageNr page number stored in each created NonTextObjectInfo + * @param page the pdf page whose annotations are read + */ + private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) { + List annotations; + try { + annotations = page.getAnnotations(); + } catch (IOException e) { + log.error("Error extracting annotations from pdf.
No NonTextObjectInfo-annotations available.", e); + return; + } + for (Iterator it = annotations.iterator(); it.hasNext();) { + try { + PDAnnotation anno = (PDAnnotation) it.next(); + log.debug("found annotation: " +anno); + if (log.isTraceEnabled()) { + log.trace("annotation def: " + dictToString(anno.getDictionary())); + } + String ft = anno.getDictionary().getNameAsString("FT"); + if (ft != null && ft.equals("Sig")) { // skip signature widgets + if (SKIP_NON_PDFAS_SIGNATURES) { + log.debug("found signature widged, skip extraction"); + continue; + } else { + COSDictionary sigDict = (COSDictionary) anno.getDictionary().getDictionaryObject("V"); + if (sigDict != null && AdobeSignatureHelper.ADOBE_SIG_FILTER.equals(sigDict.getNameAsString("Filter"))) { + log.debug("found PDF-AS signature widged, skip extraction"); + continue; + } + } + } + + NonTextObjectInfo objInfo = new NonTextObjectInfo(); + objInfo.setName(anno.getDictionary().getString( "NM" )); + objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION); + + String subtype = anno.getDictionary().getNameAsString("Subtype"); + String subj = anno.getDictionary().getString("Subj"); + if (subj != null) { + subtype+= "/" + subj; + } + objInfo.setSubType(subtype); + + objInfo.setPageNr(pageNr); + objInfo.setHeight(anno.getRectangle().getHeight()); + objInfo.setWidth(anno.getRectangle().getWidth()); + objectInfos.add(objInfo); + } catch (Exception ex) { + log.info("error reading non text object info key " + ex); + } + } + } + + /** + * Create string representation from COSDictionary + * @param dict the dictionary to render as text + * @return the string representation of the given dictionary + */ + public static String dictToString(COSDictionary dict) + { + try { + String retVal = "COSDictionary{"; + for (int i = 0; i 0.8.0 needed + } + if (log.isDebugEnabled()) { + log.debug("extracted non textual objects count: " + objectInfos.size()); + } + return objectInfos; + + } + + /** + * Extract non textual data from pdf.
+ * @param pdfDataSource + * + * @see org.pdfbox.ExtractImages + * + * @return List of {@link NonTextObjectInfo} + */ + public static List extractNonTextInfo(PdfDataSource pdfDataSource) { + + PDDocument doc = null; + try { + doc = PDDocument.load(pdfDataSource.createInputStream()); + List res = extractNonTextInfo(doc); + doc.close(); + return res; + } catch (IOException e) { + log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e); + return new ArrayList(); + } finally { + if (doc != null) { + try { + doc.close(); + } catch (IOException e) { + log.error("error closing pddocument", e); + } + } + } + + } + +} -- cgit v1.2.3