/** * Copyright 2006 by Know-Center, Graz, Austria * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a * joint initiative of the Federal Chancellery Austria and Graz University of * Technology. * * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by * the European Commission - subsequent versions of the EUPL (the "Licence"); * You may not use this work except in compliance with the Licence. * You may obtain a copy of the Licence at: * http://www.osor.eu/eupl/ * * Unless required by applicable law or agreed to in writing, software * distributed under the Licence is distributed on an "AS IS" basis, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Licence for the specific language governing permissions and * limitations under the Licence. * * This product combines work with different licenses. See the "NOTICE" text * file for details on the various modules and licenses. * The "NOTICE" text file is part of the distribution. Any derivative works * that you distribute must include a readable copy of the "NOTICE" text file. */ package at.knowcenter.wag.egov.egiz.pdf; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.log4j.Logger; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo; import at.gv.egiz.pdfas.framework.input.PdfDataSource; /** * Method for object extraction from pdf documents. * This uses pdf-box 0.8.0, not 0.7.2. Packages org.apache.pdfbox instead org.pdfbox! * @author dferbas * */ public class ObjectExtractor { private static Logger log = Logger.getLogger(ObjectExtractor.class); /* * If set true signature annotations are not extracted otherwise * all signatures except PDF-AS signatures are extracted. */ private final static boolean SKIP_NON_PDFAS_SIGNATURES = false; /** * Find annotation objects in pdf documents * @param objectInfos * @param pageNr * @param page */ private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) { List annotations; try { annotations = page.getAnnotations(); } catch (IOException e) { log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e); return; } for (Iterator it = annotations.iterator(); it.hasNext();) { try { PDAnnotation anno = (PDAnnotation) it.next(); log.debug("found annotation: " +anno); if (log.isTraceEnabled()) { log.trace("annotation def: " + dictToString(anno.getDictionary())); } String ft = anno.getDictionary().getNameAsString("FT"); if (ft != null && ft.equals("Sig")) { // skip signature widgets if (SKIP_NON_PDFAS_SIGNATURES) { log.debug("found signature widged, skip extraction"); continue; } else { COSDictionary sigDict = (COSDictionary) anno.getDictionary().getDictionaryObject("V"); if (sigDict != null && AdobeSignatureHelper.ADOBE_SIG_FILTER.equals(sigDict.getNameAsString("Filter"))) { log.debug("found PDF-AS signature widged, skip extraction"); continue; } } } NonTextObjectInfo objInfo = new NonTextObjectInfo(); objInfo.setName(anno.getDictionary().getString( "NM" )); objInfo.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION); String subtype = anno.getDictionary().getNameAsString("Subtype"); String subj = anno.getDictionary().getString("Subj"); if (subj != null) { subtype+= "/" + subj; } objInfo.setSubType(subtype); objInfo.setPageNr(pageNr); objInfo.setHeight(anno.getRectangle().getHeight()); objInfo.setWidth(anno.getRectangle().getWidth()); objectInfos.add(objInfo); } catch (Exception ex) { log.info("error reading non text object info key " + ex); } } } /** * Create string representation from COSDictionary * @param dict * @return */ public static String dictToString(COSDictionary dict) { try { String retVal = "COSDictionary{"; for (int i = 0; i 0.8.0 needed } if (log.isDebugEnabled()) { log.debug("extracted non textual objects count: " + objectInfos.size()); } return objectInfos; } /** * Extract non textual data from pdf. * @param pdfDataSource * * @see org.pdfbox.ExtractImages * * @return List of {@link NonTextObjectInfo} */ public static List extractNonTextInfo(PdfDataSource pdfDataSource) { PDDocument doc = null; try { doc = PDDocument.load(pdfDataSource.createInputStream()); List res = extractNonTextInfo(doc); doc.close(); return res; } catch (IOException e) { log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e); return new ArrayList(); } finally { if (doc != null) { try { doc.close(); } catch (IOException e) { log.error("error closing pddocument", e); } } } } }