/**
* Copyright 2006 by Know-Center, Graz, Austria
* PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
* joint initiative of the Federal Chancellery Austria and Graz University of
* Technology.
*
* Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
* the European Commission - subsequent versions of the EUPL (the "Licence");
* You may not use this work except in compliance with the Licence.
* You may obtain a copy of the Licence at:
* http://www.osor.eu/eupl/
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the Licence is distributed on an "AS IS" basis,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the Licence for the specific language governing permissions and
* limitations under the Licence.
*
* This product combines work with different licenses. See the "NOTICE" text
* file for details on the various modules and licenses.
* The "NOTICE" text file is part of the distribution. Any derivative works
* that you distribute must include a readable copy of the "NOTICE" text file.
*/
package at.knowcenter.wag.egov.egiz.pdf;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import at.gv.egiz.pdfas.api.analyze.NonTextObjectInfo;
import at.gv.egiz.pdfas.framework.input.PdfDataSource;
/**
* Utility methods for extracting objects from PDF documents.
* This uses PDFBox 0.8.0, not 0.7.2, i.e. the packages are org.apache.pdfbox instead of org.pdfbox!
* @author dferbas
*
*/
public class ObjectExtractor {
/** Logger used by the static methods of this class. */
private static Logger log = Logger.getLogger(ObjectExtractor.class);

/*
 * Controls how signature widgets are handled during annotation extraction:
 *   true  - every signature widget is skipped (no signature annotation is extracted);
 *   false - only PDF-AS signature widgets are skipped, all other signatures are extracted.
 */
private static final boolean SKIP_NON_PDFAS_SIGNATURES = false;
/**
 * Collects the annotations of a single page as {@link NonTextObjectInfo} entries.
 * <p>
 * Signature widgets (field type "Sig") are skipped completely when
 * SKIP_NON_PDFAS_SIGNATURES is set; otherwise only those whose value dictionary
 * carries the filter name AdobeSignatureHelper.ADOBE_SIG_FILTER are skipped.
 * If the annotation list cannot be read, the error is logged and nothing is added.
 * An individual annotation that fails to be read is logged and left out, the
 * remaining annotations are still processed.
 *
 * @param objectInfos list that receives one {@link NonTextObjectInfo} per extracted annotation
 * @param pageNr page number stored in every created entry
 * @param page page whose annotations are inspected
 */
private static void doExtractAnnotations(List objectInfos, int pageNr, PDPage page) {
    List pageAnnotations;
    try {
        pageAnnotations = page.getAnnotations();
    } catch (IOException e) {
        log.error("Error extracting annotations from pdf. No NonTextObjectInfo-annotations available.", e);
        return;
    }

    Iterator annotationIter = pageAnnotations.iterator();
    while (annotationIter.hasNext()) {
        try {
            PDAnnotation annotation = (PDAnnotation) annotationIter.next();
            log.debug("found annotation: " + annotation);

            COSDictionary annotationDict = annotation.getDictionary();
            if (log.isTraceEnabled()) {
                log.trace("annotation def: " + dictToString(annotationDict));
            }

            // Signature widgets are identified by their field type entry.
            if ("Sig".equals(annotationDict.getNameAsString("FT"))) {
                if (SKIP_NON_PDFAS_SIGNATURES) {
                    log.debug("found signature widged, skip extraction");
                    continue;
                }
                // Only signatures created with the PDF-AS filter are left out.
                COSDictionary signatureDict = (COSDictionary) annotationDict.getDictionaryObject("V");
                boolean isPdfAsSignature = signatureDict != null
                        && AdobeSignatureHelper.ADOBE_SIG_FILTER.equals(signatureDict.getNameAsString("Filter"));
                if (isPdfAsSignature) {
                    log.debug("found PDF-AS signature widged, skip extraction");
                    continue;
                }
            }

            NonTextObjectInfo info = new NonTextObjectInfo();
            info.setName(annotationDict.getString("NM"));
            info.setObjectType(NonTextObjectInfo.TYPE_ANNOTATION);

            // The reported sub type is "<Subtype>" or "<Subtype>/<Subj>" when a subject is present.
            String subTypeLabel = annotationDict.getNameAsString("Subtype");
            String subject = annotationDict.getString("Subj");
            if (subject != null) {
                subTypeLabel = subTypeLabel + "/" + subject;
            }
            info.setSubType(subTypeLabel);

            info.setPageNr(pageNr);
            info.setHeight(annotation.getRectangle().getHeight());
            info.setWidth(annotation.getRectangle().getWidth());

            // Added last, so a failure above never leaves a half-filled entry in the list.
            objectInfos.add(info);
        } catch (Exception ex) {
            log.info("error reading non text object info key " + ex);
        }
    }
}
/**
* Create string representation from COSDictionary
* @param dict
* @return
*/
public static String dictToString(COSDictionary dict)
{
try {
String retVal = "COSDictionary{";
for (int i = 0; i 0.8.0 needed
}
if (log.isDebugEnabled()) {
log.debug("extracted non textual objects count: " + objectInfos.size());
}
return objectInfos;
}
/**
 * Extracts the non-textual objects from the PDF provided by the given data source.
 * <p>
 * The document is loaded from {@code pdfDataSource.createInputStream()}, passed to
 * {@code extractNonTextInfo(PDDocument)} and closed exactly once before this method
 * returns. An {@link IOException} raised while loading or processing the document is
 * logged and answered with an empty list. A failure while closing the document is
 * only logged and does not affect the returned result.
 *
 * @param pdfDataSource source of the PDF document to analyse
 *
 * @see org.apache.pdfbox.ExtractImages
 *
 * @return List of {@link NonTextObjectInfo}; an empty list if the document could not be read
 */
public static List extractNonTextInfo(PdfDataSource pdfDataSource) {
    PDDocument doc = null;
    try {
        doc = PDDocument.load(pdfDataSource.createInputStream());
        // No close() here: the finally block is the single place where the document
        // is closed, so a failing close can no longer discard an extracted result.
        return extractNonTextInfo(doc);
    } catch (IOException e) {
        log.error("Error extracting images from pdf. No NonTextObjectInfo available.", e);
        return new ArrayList();
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                log.error("error closing pddocument", e);
            }
        }
    }
}
}